/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

static const char __idstring[] = "@(#)$Id: mx__lib.c,v 1.354 2006/12/13 20:47:40 reese Exp $";

#include "mx_auto_config.h"
#include "myriexpress.h"
#include "mx__lib_types.h"
#include "mx__lib.h"
#include "mx__fops.h"
#include "mx__lib.h"
#include "mx_util.h"
#include "mx__request.h"
#include "mx__driver_interface.h"
#include "mx__regcache.h"
#include "mx__partner.h"
#include "mx__endpoint.h"
#include "mx__error.h"
#include "mx_byteswap.h"
#include "mx_pin.h"
#include "mx__segment.h"
#include "mx_stbar.h"
#include "mx__handle_map.h"
#include "mx__stack.h"
#include "mx__block.h"
#include "mx__memory_pool.h"
#include "mcp_config.h"
#include "mx__requests.h"
#include "mx_connect.h"
#include "mx__mcp_request_ring.h"
#include "mx__sleep.h"
#include "mx__shmem.h"
#include "mx__shim.h"
#include "mx__internals.h"
#include "mx__wait_queue.h"
#include "mx__wire.h"
#include "mx__debug_dump.h"
#include "mx__ack.h"

/* Enable copying for vectorial communication,
 * at least until the MCP has real support for it. */
#define MX_VECT_COPY 1

/* TODO: Get values from driver instead of header. */

#ifndef MX_KERNEL
/* Only defined when MX_KERNEL is not set. */
uint32_t mx_max_native_endpoints;
#endif

#if MX_RUNTIME_OPT == 1
/* Option block consulted throughout this file (mx__opt.verbose,
   mx__opt.rcache, mx__opt.fw_ack, ...); zero-initialized here. */
struct mx__opt mx__opt = {0 };
#endif

/* Forward declarations for static helpers that are called in this file
   before their definitions appear. */
static void mx__send_acked_and_mcp_complete(mx_endpoint_t ep, union mx_request *r,
					    mx_status_code_t status);
static void mx__release_recv_large(struct mx_endpoint *ep, union mx_request *r, 
				   mx_status_code_t status);
static void mx__handle_liback(mx_endpoint_t,struct mx__partner *,uint16_t);

static void mx__rndv_got_notify(mx_endpoint_t ep, unsigned rdma_id,
			     uint32_t length, unsigned seqno);

/* Only declared when the rendez-vous path is compiled out (MX_NO_RNDV). */
#if MX_NO_RNDV
static void mx__post_fake_rndv(int ze, mcp_ureq_t *req,
			       struct mx__partner *partner, uint64_t match_info,
			       uint32_t msg_length, uint16_t seqnum, uint16_t cookie, 
			       uint8_t rdma_id, uint8_t rdma_seqnum, uint16_t offset);
#endif


/* Open board number `i` and store the resulting handle in `*handle`.
   Simply forwards to mx__open() with -1 as its second argument and
   returns whatever mx__open() returns. */
MX_FUNC(mx_return_t)
mx_open_board(int i, mx_endpt_handle_t *handle)
{
  mx_return_t rc;

  rc = mx__open(i, -1, handle);
  return rc;
}

/* Try boards 0..255 in order and hand back the first one that opens.
 *
 * On success the handle is stored in `*rethandle` and MX_SUCCESS is
 * returned.  Otherwise the return value reflects how far the attempts got:
 * MX_NOT_INITIALIZED if the library was never initialized, MX_NO_DEV if no
 * device entry was seen, MX_NO_DRIVER if no driver answered, MX_NO_PERM if
 * every failure was a permission failure, and MX_NO_RESOURCES in all
 * remaining cases.
 */
MX_FUNC(mx_return_t)
mx_open_any_board(mx_endpt_handle_t *rethandle)
{
  int board = 0;
  int saw_dev_entry = 0;
  int saw_driver = 0;
  int saw_permission = 0;

  if (Mx_init_count == 0)
    return MX_NOT_INITIALIZED;

  /* TODO: Magic number. */
  while (board <= 255) {
    mx_endpt_handle_t opened;
    mx_return_t rc = mx__open(board, -1, &opened);

    if (rc == MX_SUCCESS) {
      *rethandle = opened;
      return MX_SUCCESS;
    }

    /* running off the end of the device entries stops the scan */
    if (rc == MX_NO_DEV)
      break;
    saw_dev_entry = 1;

    /* so does a missing driver */
    if (rc == MX_NO_DRIVER)
      break;
    saw_driver = 1;

    if (rc != MX_NO_PERM)
      saw_permission = 1;

    board++;
  }

  if (!saw_dev_entry)
    return MX_NO_DEV;
  if (!saw_driver)
    return MX_NO_DRIVER;
  if (!saw_permission)
    return MX_NO_PERM;

  /* default to no resources */
  return MX_NO_RESOURCES;
}


/* Turn a matched receive into a RECV_LARGE request and append it to the
   endpoint's send request queue.  The DEAD bit of the request state is
   carried over; the state is otherwise replaced by
   RECV_MATCHED | SEND_QUEUED.  Always returns MX_SUCCESS. */
int
mx__queue_large_recv(mx_endpoint_t ep, union mx_request *r)
{
  /* apart from DEAD, the request must be exactly PENDING | RECV_MATCHED */
  mx_assert ((r->recv.basic.state & ~MX__REQUEST_STATE_DEAD)
    == (MX__REQUEST_STATE_PENDING|MX__REQUEST_STATE_RECV_MATCHED));

  r->recv.basic.type = MX__REQUEST_TYPE_RECV_LARGE;
  r->recv.basic.state = (r->recv.basic.state & MX__REQUEST_STATE_DEAD)
    | MX__REQUEST_STATE_RECV_MATCHED
    | MX__REQUEST_STATE_SEND_QUEUED;
  r->basic.requeued = 0;

  /* FIXME: why can't we try to push the request to the MCP now ? */
#if MX_NO_RNDV
  r->recv.notifying = 0;
#else
  /* a zero-length transfer goes straight to the notify step */
  r->recv.notifying = (r->recv.basic.status.xfer_length == 0);
#endif

  mx__enqueue_request(&ep->send_reqq, r);
  return MX_SUCCESS;
}

/* Recompute the partner's fully_recv_seq and, when it changed and no
   liback is already pending, either queue a liback immediately or put
   the partner on the endpoint's partners_to_ack list (arming the timer
   if needed). */
void
mx__liback(mx_endpoint_t ep, struct mx__partner *partner)
{
  unsigned seq;

  /* first partially received message if there is one, else recv_seq */
  if (mx__isempty_partner_request_queue(&partner->partialq))
    seq = partner->recv_seq;
  else
    seq = mx__first_partner_request(&partner->partialq)->recv.msg_seq;

  if (seq == partner->fully_recv_seq)
    return;
  partner->fully_recv_seq = seq;

  if (partner->liback_pending)
    return;

  /* enough un-acked messages: ack right now */
  if (MX__SEQNO(seq - partner->recv_acked) >= mx__opt.imm_ack) {
    mx__queue_liback(ep, partner, 1);
    return;
  }

  /* a non-null tqe_prev means the partner is already on the ack list */
  if (partner->ack_list.tqe_prev != 0)
    return;

  partner->oldest_recv_time = mx_jiffies(ep);
  TAILQ_INSERT_TAIL(&ep->partners_to_ack, partner, ack_list);

  if (ep->timer || mx__opt.monothread)
    return;
  ep->timer = 1;
  mx__arm_timer(ep->handle, 100);
}

/* Deregister `rdma_id` with the driver (skipped when MX_NO_RNDV is set)
   and then release the id through mx_rdma_unuse().  A deregistration
   failure is fatal: EIO gets an explicit message, anything else trips an
   assertion. */
static void
mx_unpin(struct mx_endpoint *ep, uint32_t rdma_id, int send)
{
#if !MX_NO_RNDV
  if (mx__deregister(ep->handle, rdma_id) != MX_SUCCESS) {
    if (mx_errno != EIO) {
      mx_always_assert(0);
    } else {
      mx_fatal("mx__deregister failed, check kernel logs for error messages");
    }
  }
#endif
  mx_rdma_unuse(&ep->rdmas, rdma_id, send);
}

/* Drop every cached (pinned) rdma window, on every endpoint, that
   overlaps the byte range [ptr, ptr + len).  Each such window is
   unpinned and moved to its endpoint's free list.  Does nothing when
   the registration cache is disabled. */
MX_FUNC(void)
mx__regcache_clean(void *ptr, size_t len)
{
  struct mx_endpoint *ep;
  uintptr_t addr = (uintptr_t) ptr;

  if (!mx__opt.rcache)
    return;

  MX__MUTEX_LOCK(&Mx_rcache_lock);
  for (ep = Mx_endpoints; ep != NULL; ep = ep->next) {
    struct mx__rdmawin *win = TAILQ_FIRST(&ep->rdmawin_pinned);

    while (win != NULL) {
      /* fetch the successor first: win may leave the list below */
      struct mx__rdmawin *following = TAILQ_NEXT(win, list);

      if (MX_MIN(win->addr + win->len, addr + len) > MX_MAX(win->addr, addr)) {
        if (mx__opt.verbose) {
          mx_printf_once("mx__regcache_clean hit:OK\n");
        }
        TAILQ_REMOVE(&ep->rdmawin_pinned, win, list);
        mx_unpin(win->ep, win->rdma_id, win->send);
        TAILQ_INSERT_HEAD(&ep->rdmawin_free, win, list);
      }
      win = following;
    }
  }
  MX__MUTEX_UNLOCK(&Mx_rcache_lock);
}

/* Obtain an rdma id for `reg_area` (stored in reg_area->rdma_id) and
 * register the area with the driver.
 *
 * ep       - endpoint owning the rdma windows
 * reg_area - region to register; only its first segment is inspected for
 *            the registration cache lookup
 * send     - non-zero for a send-side window, zero for receive
 *
 * Any registration failure is fatal (mx_fatal).
 */
static void
mx_rdmawin_get(struct mx_endpoint *ep, mx_reg_t *reg_area, int send)
{
  int cc;
  struct mx__rdmawin *r;
  mx_reg_seg_t *seg = &reg_area->segs;

  /* no regcache for multiple segments for now */
  if (!MX_NO_RNDV && mx__opt.rcache && reg_area->nsegs == 1) {
    MX__MUTEX_LOCK(&Mx_rcache_lock);
    /* look for a pinned window with the same direction, start and end */
    TAILQ_FOREACH(r, &ep->rdmawin_pinned, list) {
      if (r->send == send
	  && r->addr == seg->vaddr
	  && r->addr + r->len == seg->vaddr + seg->len) {
	break;
      }
    }
    /* r is NULL here when the loop ran to completion without a match */
    if (r) {
      /* hit shortcut */
      MX__EP_STATS_INC(ep, rcache_hit);
      MX__EP_STATS_ADD(ep, rcache_hit_kbytes, seg->len / 1024);
      /* reuse the cached id and recycle the window descriptor */
      reg_area->rdma_id = r->rdma_id;
      TAILQ_REMOVE(&ep->rdmawin_pinned, r, list);
      TAILQ_INSERT_HEAD(&ep->rdmawin_free, r, list);
      MX__MUTEX_UNLOCK(&Mx_rcache_lock);
      /* NOTE(review): the hit path still reaches mx__register() below;
         confirm that re-registering a cached window is intended. */
      goto do_register;
    }
    MX__MUTEX_UNLOCK(&Mx_rcache_lock);
    MX__EP_STATS_INC(ep, rcache_miss);
    MX__EP_STATS_ADD(ep, rcache_miss_kbytes, reg_area->segs.len / 1024);
  }
  /* miss, or cache not applicable: take a fresh rdma slot */
  reg_area->rdma_id = mx_rdma_allocate_slot(&ep->rdmas, send);
  /* nothing is registered with the driver when rendez-vous is disabled */
  if (MX_NO_RNDV)
    return;

 do_register:
  mx_assert(reg_area->rdma_id >= 0);
#if MX_DRIVER_API_MAGIC >= 0x500
  reg_area->seqnum = ep->rdma_requests[reg_area->rdma_id].seqno;
#endif
#if MX_OS_UDRV
  /* Buffers outside the 256MB window starting at mx__init_brk are
     replaced by a page-aligned private copy; for sends the user data is
     copied into it, and seg->vaddr is redirected to the copy. */
  if (seg->vaddr < (size_t)mx__init_brk ||
      seg->vaddr > (size_t)mx__init_brk + 256*1024*1024) {
    struct mx_rdma_req *rdma = ep->rdma_requests + reg_area->rdma_id;
    mx_assert(rdma->copy == 0);
    rdma->copy = memalign(MX_RDMA_PAGE_SIZE, seg->len);
    rdma->copy_offset = (size_t)rdma->copy - seg->vaddr;
    if (send)
      memcpy(rdma->copy, (void*)(size_t)seg->vaddr, seg->len);
    seg->vaddr = (size_t)rdma->copy;
  }
#endif
  cc = mx__register(ep->handle, reg_area);
  if (cc != 0) {
    if (mx_errno == EIO) {
      mx_fatal("mx__register failed, check kernel logs for error messages");
    } else {
      MX_WARN(("mx__register_%s:%s\n", 
	       send ? "send":"recv",
	       mx_strerrno));
      mx_fatal("mx__register failed");
    }
  }
}

/* Give back an rdma window.
 *
 * When the registration cache applies (rendez-vous enabled, cache enabled,
 * single segment) the window is not unpinned: it is recorded at the head of
 * the endpoint's pinned list, recycling a free descriptor or, if none is
 * free, evicting (unpinning) the last pinned one.  Otherwise the window is
 * unpinned immediately and, under MX_OS_UDRV, its private copy is freed.
 */
static void 
mx_rdmawin_release(struct mx_endpoint *ep, uint32_t rdma_id, int send, mx_reg_t *reg_area)
{
  struct mx__rdmawin *slot;

  /* no regcache for multiple segments for now */
  if (MX_NO_RNDV || !mx__opt.rcache || reg_area->nsegs != 1) {
    mx_unpin(ep, rdma_id, send);
#if MX_OS_UDRV
    {
      struct mx_rdma_req *rdma = ep->rdma_requests + rdma_id;

      if (rdma->copy) {
        free(rdma->copy);
        rdma->copy = 0;
      }
    }
#endif
    return;
  }

  MX__MUTEX_LOCK(&Mx_rcache_lock);
  if (!TAILQ_EMPTY(&ep->rdmawin_free)) {
    slot = TAILQ_FIRST(&ep->rdmawin_free);
    TAILQ_REMOVE(&ep->rdmawin_free, slot, list);
  } else {
    /* no free descriptor: evict the entry at the tail of the pinned list */
    slot = TAILQ_LAST(&ep->rdmawin_pinned, s_rdmawin_pinned);
    TAILQ_REMOVE(&ep->rdmawin_pinned, slot, list);
    mx_unpin(slot->ep, slot->rdma_id, slot->send);
  }
  slot->ep = ep;
  slot->rdma_id = rdma_id;
  slot->send = send;
  slot->addr = (uintptr_t) reg_area->segs.vaddr;
  slot->len = reg_area->segs.len;
  TAILQ_INSERT_HEAD(&ep->rdmawin_pinned, slot, list);
  MX__MUTEX_UNLOCK(&Mx_rcache_lock);
}

/* Walk the probe wait queue of context `ctxid` and return the first
   entry for which mx__endpoint_match_unexpected() finds a request with
   that entry's match_info/match_mask.  Returns NULL when no probe is
   satisfied. */
struct mx__wait_queue *
mx__try_probe(mx_endpoint_t ep, uint32_t ctxid)
{
  struct mx__wait_queue *wq;
  struct mx__wait_queue_head * elt;

  MX__FOREACH_WAIT_QUEUE(wq, elt, &ep->ctxid[ctxid].probe_queue_head) {
    if (mx__endpoint_match_unexpected(ep, wq->type.probe.match_info,
                                      wq->type.probe.match_mask) != NULL) {
      return wq;
    }
  }
  return NULL;
}

/*
 * Finish a receive request.
 *
 * ep     - endpoint owning the request
 * r      - receive request being completed
 * status - completion status; any value other than MX_STATUS_SUCCESS is
 *          reported through mx__error_req()
 *
 * Returns r when the request stays alive and has been handed to
 * waiters/peekers.  Returns NULL when the request was released here: put
 * targets are always freed, and so are requests carrying the DEAD state
 * (mx_forget was called).
 *
 * For a put target whose status.context points at a large receive, the
 * transferred length is accumulated on that large request; once the large
 * request has received everything it is completed recursively.
 */
union mx_request *
mx__recv_complete(struct mx_endpoint *ep, union mx_request *r, mx_status_code_t status)
{
  uint64_t match_info = r->basic.status.match_info;
  uint32_t ctxid = CTXID_FROM_MATCHING(ep, match_info);

#if MX_DEBUG
  if (status == MX_SUCCESS && r->recv.basic.status.code == MX_SUCCESS)
    mx_assert(r->recv.basic.status.xfer_length != -1);
#endif

  MX__EP_STATS_INC(ep, completion);
  if (ep->in_progression_thread)
    MX__EP_STATS_INC(ep, overlapped_completion);

  if (status == MX_STATUS_SUCCESS) {
    /* update receive buffers valgrind status */
    MX_VALGRIND_MEMORY_MAKE_SEGMENTS_READABLE(r->recv.segments,
                                              r->recv.count,
                                              r->recv.basic.status.xfer_length);
  }

  if (r->recv.count > 1) {
    /* release the segment list */
    mx_free(r->recv.segments);
  }

  if (!(r->recv.basic.state & MX__REQUEST_STATE_DEAD)) {
    r->recv.basic.state |= MX__REQUEST_STATE_COMPLETED;

    if (r->recv.basic.status.code == MX_SUCCESS) {
      /* only set the status if it is not already set to an error */
      if (status == MX_STATUS_SUCCESS) {
        /* override status code if there was a truncation problem */
        if (r->recv.basic.status.xfer_length < r->recv.basic.status.msg_length) {
          r->recv.basic.status.code =
            mx__error_req(ep, "recv req", r, MX_STATUS_TRUNCATED);
        }
      } else {
        r->recv.basic.status.code =
          mx__error_req(ep, "recv req", r, status);
      }
    }

    if (r->recv.put_target) {
      /* for a put target, the context holds the large receive (or NULL) */
      union mx_request *large = r->basic.status.context;
      mx_fixme_assert(status == MX_STATUS_SUCCESS);
      if (large) {
        large->recv.accum += r->basic.status.xfer_length;
        if (large->recv.accum == large->basic.status.xfer_length) {
          /* the whole large message has now arrived */
          mx_fixme_assert(!(large->basic.state & MX__REQUEST_STATE_MCP));
          mx_fixme_assert(!(large->basic.state & MX__REQUEST_STATE_SEND_QUEUED));
          if (!(large->basic.state & MX__REQUEST_STATE_ACKED)) {
            struct mx__partner_request_queue_head * head;
            mx_fixme_assert (mx__opt.fw_ack == 0);

            /* remove from the pending queue and that's it */
            /* NOTE(review): the queue is looked up through r (the put
               target), not through large; confirm these are the intended
               partner/sequence keys. */
            head = &MX__PARTNER_PENDING_SEQNUM_TO_QUEUE(r->basic.partner, r->basic.send_seq);
            mx__spliceout_partner_request(head, large);

            mx__spliceout_request(&ep->resend_list, large);
            mx__send_acked_and_mcp_complete(ep, large, MX_STATUS_SUCCESS);
          }
          mx__spliceout_request(&ep->large_getq, large);
          mx__queue_liback(ep, large->basic.partner, 1);
          mx__release_recv_large(ep, large, MX_STATUS_SUCCESS);
          mx__recv_complete(ep, large, MX_STATUS_SUCCESS);
        }
      }
      mx__rl_free(ep, r);
      return NULL;
    } else {
      mx__notify_waiter_request_done(ep, r);
      mx__notify_peeker_request_done(ep, r, ctxid);
      return r;
    }
  } else {
    /* mx_forget was called */
    if (status != MX_STATUS_SUCCESS) {
      if (!ep->cancelled) {
        /* only handle errors when the endpoint is totally open */
        /* this is the receive path, so label the request accordingly */
        mx__error_req(ep, "recv req (already completed)", r, status);
      }
    }
    mx__rl_free(ep, r);
    return NULL;
  }
}

#define MX__UNEXP_STORAGE_THRESHOLD (4000000)

/* Decide whether the "ordered" unexpected length of the endpoint exceeds
 * `max`.
 *
 * If the cached ep->ordered_unexp_length is already above `max`, returns 1
 * without rescanning.  Otherwise every request on ctxid[0].unexpq is
 * weighed: a request counts for its msg_length unless its partner has a
 * partial message and the request is not before the first partial one, in
 * which case it counts for 0.  The per-request weight and the endpoint
 * total are stored back, and the result of (total > max) is returned.
 */
static uint32_t
mx__virtual_unexp_length_too_big(mx_endpoint_t ep, uint32_t max)
{
  union mx_request *unexp;
  struct mx__request_queue_head *elt;
  uint32_t total = 0;
  int over_limit;

  if (ep->ordered_unexp_length > max) {
    return 1;
  }

  /* TODO Brice or Loic: fix this for context ids
   * and re-enable unexp_queue_length_max and context_id at the same time
   * in mx__process_params. */
  MX__FOREACH_REQ(unexp, elt, &ep->ctxid[0].unexpq) {
    uint32_t weight = unexp->basic.status.msg_length;

    if (!mx__isempty_partner_request_queue(&unexp->basic.partner->partialq)) {
      uint16_t unexp_seq = MX__SEQNO(unexp->recv.msg_seq);
      uint16_t first_partial_seq = mx__first_partner_request(&unexp->basic.partner->partialq)->recv.msg_seq;

      /* (MX__QUADRANT_ONE *2) is the "sign bit" in the SEQNO range:
         when it is clear, unexp_seq is not before first_partial_seq */
      if (!((unexp_seq - first_partial_seq) & (MX__QUADRANT_ONE * 2))) {
        weight = 0;
      }
    }
    unexp->recv.ordered_unexp_weight = weight;
    total += weight;
  }
  ep->ordered_unexp_length = total;

  over_limit = (total > max);
  /* report at verbosity 1 when over the limit, at verbosity 2 otherwise */
  if (mx__opt.verbose >= (over_limit ? 1 : 2)) {
    mx_printf("unexp_vlength=%d, unexp_length=%d\n", total, ep->unexp_queue_length);
  }
  return over_limit;
}

/* Build a receive request for an incoming message that matched no posted
 * receive.
 *
 * ep         - endpoint receiving the message
 * type       - event type; MX_MCP_UEVT_RECV_RNDV yields a RECV_LARGE request
 *              with no storage, other types a RECV request
 * msg_length - message length; also stored as the request's xfer_length
 * match_info - match information stored in the request status
 * ctxid      - context id used to notify probers
 *
 * Returns the new request, or NULL when no request could be allocated, the
 * unexpected queue limit is exceeded, or the storage allocation failed.
 */
static union mx_request *
mx__create_unexp_for_evt(mx_endpoint_t ep, uint8_t type, uint32_t msg_length,
			 uint64_t match_info, uint32_t ctxid)
{
  union mx_request * r;

  /* handle unexpected (TODO: A little like mx_irecv) */
  r = mx__rl_alloc(ep);
  if (r == NULL) {
    mx_printf_once("INFO: mx__create_unexp_for_evt:mx__rl_alloc failed\n");
    return NULL;
  }
  r->recv.basic.status.match_info = match_info;
  r->recv.basic.wq = NULL;
  /* single segment, stored inside the request itself */
  r->recv.segments = &r->recv.segment;
  r->recv.count = 1;
  r->recv.memory_context = MX_PIN_LIBMX_CTX;
  r->recv.unexpected = 1;
  r->recv.put_target = 0;
  r->recv.basic.status.xfer_length = msg_length;
#if MX_ONE_SIDED
  if ((match_info & MX__ONESIDED_MASK) == MX__ONESIDED_PUT) {
    /* one-sided put: the target address is carried in match_info itself */
    r->recv.segment.segment_ptr = MX_VA_TO_SEGMENT_PTR((void*)(uintptr_t)(match_info & ~MX__ONESIDED_MASK));
    r->recv.segment.segment_length = 0x7fffffff;
    r->recv.unexpected = 0;
    r->recv.put_target = 1;
    r->recv.r_length = 0x7fffffff;
    r->recv.basic.status.context = NULL;
  } else if ((match_info & MX__ONESIDED_MASK) == MX__ONESIDED_PUT_FAKE) {
    /* fake put: bits 32..39 of match_info select the rdma request, the low
       32 bits give the offset inside the large receive's buffer */
    uint8_t rdma_id = (uint8_t)((match_info >> 32) & 0xff);
    union mx_request *large;
    large = ep->rdma_requests[rdma_id].req;
    mx_assert(large);
    r->recv.segment.segment_ptr = (char*)large->recv.segment.segment_ptr + (uint32_t)match_info;
    r->recv.segment.segment_length = 0x7fffffff;
    r->recv.unexpected = 0;
    r->recv.put_target = 1;
    r->recv.r_length = 0x7fffffff;
    /* remember the large receive so that completion can account for it */
    r->recv.basic.status.context = large;
  } else
#endif
    if (type != MX_MCP_UEVT_RECV_RNDV && msg_length) {
      void *segment_ptr;

      /* refuse to buffer more once the configured limit is exceeded; the
	 costlier "virtual length" check only runs above the threshold */
      if (ep->unexp_queue_length_max != 0
	  && ep->unexp_queue_length > MX__UNEXP_STORAGE_THRESHOLD
	  && mx__virtual_unexp_length_too_big(ep, ep->unexp_queue_length_max)) {
	if (mx__opt.verbose)
	  mx_printf_once("Warning: unexpected queue limit exceeded\n");
	mx__rl_free(ep, r);
	return NULL;
      }
      segment_ptr = mx_malloc(msg_length);
      if (segment_ptr == NULL) {
	mx_printf_once("Warning: mx__create_unexp_for_evt:mx_malloc failed\n");
	mx__rl_free(ep, r);
	return NULL;
      }
      r->recv.ordered_unexp_weight = 0;
      ep->unexp_queue_length += msg_length;
      r->recv.segment.segment_ptr = MX_VA_TO_SEGMENT_PTR(segment_ptr);
      r->recv.segment.segment_length = msg_length;
    } else {
      /* no data to store for 0-length messages or rndv */
      r->recv.segment.segment_ptr = MX_VA_TO_SEGMENT_PTR(NULL);
      r->recv.segment.segment_length = msg_length;
    }
  r->recv.basic.state = MX__REQUEST_STATE_PENDING;
  if (type == MX_MCP_UEVT_RECV_RNDV) {
    r->recv.basic.type = MX__REQUEST_TYPE_RECV_LARGE;
  }
  else {
    r->recv.basic.type = MX__REQUEST_TYPE_RECV;
  }
  /* put targets count as already matched; everything else is announced to
     probers as an unexpected message */
  if (!r->recv.put_target) {
    mx__notify_prober_unexpected(ep, r, ctxid);
  } else {
    r->recv.basic.state |= MX__REQUEST_STATE_RECV_MATCHED;
  }

  return r;
}

/* Receive handler for tiny messages: the payload is carried inline in the
 * event.  Copies xfer_length bytes into the request's segment(s), marks
 * the request COMPLETED, and completes it when it was a posted (not
 * unexpected) receive.
 *
 * With MX_ONE_SIDED, an unexpected message tagged MX__ONESIDED_GET is
 * answered by sending back the requested region; the request and its
 * storage are then released.
 */
static void
mx__process_recv_tiny(mx_endpoint_t ep, union mx_request *r, mcp_uevt_msg_t *evt, void *dummy)
{
  mcp_uevt_tiny_t *tiny = (mcp_uevt_tiny_t *)evt;

  if (likely(r->recv.basic.status.xfer_length)) {
    if (unlikely(r->recv.count != 1)) {
      mx__copy_to_segments(r->recv.segments, r->recv.count, r->recv.memory_context,
                           0, (char *) tiny->data, r->recv.basic.status.xfer_length);
    } else {
      mx_memcpy_to_segment(r->recv.segments[0].segment_ptr, tiny->data,
                           r->recv.basic.status.xfer_length,
                           r->recv.memory_context);
    }
  }

  r->recv.basic.state |= MX__REQUEST_STATE_COMPLETED;
  if (likely(r->recv.unexpected == 0)) {
    mx__recv_complete(ep, r, MX_STATUS_SUCCESS);
    return;
  }
#if MX_ONE_SIDED
  if ((r->recv.basic.status.match_info & MX__ONESIDED_MASK) == MX__ONESIDED_GET) {
    uint64_t match_info = r->recv.basic.status.match_info;
    uint32_t ctxid = CTXID_FROM_MATCHING(ep, match_info);
    mx__get_t *get_hdr = (void*)tiny->data;
    mx_segment_t reply_seg;
    mx_request_t reply_req;
    mx_return_t rc;

    /* the inline payload describes the region to send back */
    reply_seg.segment_ptr = MX_VA_TO_SEGMENT_PTR((void*)(uintptr_t)get_hdr->data_target);
    reply_seg.segment_length = get_hdr->length;

    ep->in_handler += 1;
    rc = mx__isend(ep, &reply_seg, 1, MX_PIN_UNDEFINED, r->recv.basic.status.source,
                   match_info | MX__ONESIDED_GET_REPLY, NULL, &reply_req);
    ep->in_handler -= 1;
    mx_always_assert(rc == MX_SUCCESS);

    /* the get request itself is consumed here */
    mx__spliceout_request(&ep->ctxid[ctxid].unexpq, r);
    mx_free((void*)(uintptr_t) r->recv.segment.segment_ptr);
    mx__rl_free(ep, r);
  }
#endif
}

/* Receive handler for small messages: copies msg_length bytes from `data`
   into the request's segments, marks the request COMPLETED and, for a
   posted (not unexpected) receive, completes it with success. */
static void
mx__process_recv_small(mx_endpoint_t ep, union mx_request *r, mcp_uevt_msg_t *evt, void *data)
{
  mx__copy_to_segments(r->recv.segments,
                       r->recv.count,
                       r->recv.memory_context,
                       0,
                       data,
                       r->recv.basic.status.msg_length);

  r->recv.basic.state |= MX__REQUEST_STATE_COMPLETED;

  /* unexpected messages wait for a matching receive */
  if (r->recv.unexpected != 0)
    return;
  mx__recv_complete(ep, r, MX_STATUS_SUCCESS);
}

/* Called once the last fragment of a multi-fragment message has arrived.
 *
 * Marks `r` COMPLETED.  When `from_partial` is set the request is removed
 * from its partner's partial queue (and, for a posted receive, from the
 * endpoint's multifrag_recvq).  A posted receive is then completed through
 * mx__recv_complete() and its result returned; an unexpected request is
 * returned as is.
 */
union mx_request *
mx__received_last_frag(mx_endpoint_t ep, union mx_request *r,
                       int from_partial, mx_status_code_t status_code)
{
  if (from_partial)
    mx__spliceout_partner_request(&r->basic.partner->partialq, r);

  r->recv.basic.state |= MX__REQUEST_STATE_COMPLETED;

  if (r->recv.unexpected != 0)
    return r;

  if (from_partial)
    mx__spliceout_request(&ep->multifrag_recvq, r);

  return mx__recv_complete(ep, r, status_code);
}

/* Copy one fragment of a medium message into the request's segments.
 *
 * A fragment whose bit is already set in r->recv.r_mask is a duplicate and
 * is ignored.  Otherwise the fragment is copied at offset
 * (1 << pipeline) * frame_seqnum, accounted in r->recv.accum, and the
 * message is finished through mx__received_last_frag() once accum reaches
 * msg_length.
 *
 * Returns r, or the result of mx__received_last_frag() for the final
 * fragment.
 */
static union mx_request *
mx__process_recv_copy_frag(mx_endpoint_t ep, union mx_request *r, mcp_uevt_msg_t *evt, void *data, int from_partial)
{
  mcp_uevt_medium_t *frag_evt = (mcp_uevt_medium_t *)evt;
  uint32_t frag_index = frag_evt->frame_seqnum;
  uint32_t frag_bytes = ntohs(frag_evt->frame_length);

  if (r->recv.r_mask & (1 << frag_index)) {
    /* duplicate */
    return r;
  }

  mx__copy_to_segments(r->recv.segments, r->recv.count,
                       r->recv.memory_context,
                       (1 << frag_evt->pipeline) * frag_index,
                       data, frag_bytes);
  mx_dcbf(data, frag_bytes);

  r->recv.r_mask |= (1 << frag_index);
  r->recv.accum += frag_bytes;
  mx_assert(r->recv.accum <= r->recv.basic.status.msg_length);

  if (r->recv.accum != r->recv.basic.status.msg_length)
    return r;
  return mx__received_last_frag(ep, r, from_partial, MX_STATUS_SUCCESS);
}

/* Receive handler for the first fragment seen of a medium message.
   Resets the fragment accounting, copies this fragment and, if the message
   is not complete yet, queues the request on its partner's partial queue
   (and on the endpoint's multifrag_recvq for a posted receive). */
static void
mx__process_recv_medium(mx_endpoint_t ep, union mx_request *r, mcp_uevt_msg_t *evt, void *data)
{
  r->recv.r_mask = 0;
  r->recv.accum = 0;

  r = mx__process_recv_copy_frag(ep, r, evt, data, 0);
  if (r == NULL || (r->recv.basic.state & MX__REQUEST_STATE_COMPLETED))
    return;

  if (r->recv.unexpected == 0)
    mx__enqueue_request(&ep->multifrag_recvq, r);

  /* Message are processed in order so partialq is ordered */
  mx__enqueue_partner_request(&r->basic.partner->partialq, r);
}


/* Receive handler for rendez-vous (large) messages.  Records the remote
   rdma value carried in the event's inline data and, for a posted receive,
   queues the large receive right away. */
static void
mx__process_recv_large(mx_endpoint_t ep, union mx_request *r, mcp_uevt_msg_t *evt, void *dummy)
{
  mcp_uevt_tiny_t *rndv_evt = (mcp_uevt_tiny_t *)evt;

  /* the following field is only used for big messages.
   * the first 4 bytes have already been used to get the length
   * in mx__get_req_for_evt()
   */
  r->recv.remote_rdma = *(uint32_t *) &rndv_evt->data[4];

  if (r->recv.unexpected)
    return;
  mx__queue_large_recv(ep, r);
}

#if MX_NO_RNDV
/* Queue one internal send carrying the next chunk of the large send `s`.
 *
 * The chunk starts at s->send.frag_off and is at most
 * ep->medium_msg_threshold bytes long.  It is tagged MX__ONESIDED_PUT_FAKE
 * with the peer rdma id and the offset encoded in match_info, and it keeps
 * a pointer to `s` in its status context.  s->send.frag_off is advanced by
 * the chunk length.
 */
static void
mx__fake_put(mx_endpoint_t ep, union mx_request *s)
{
  uint32_t chunk;
  union mx_request *frag = mx__rl_alloc(ep);

  mx_fixme_assert(frag);

  chunk = s->basic.status.xfer_length - s->send.frag_off;
  chunk = MX_MIN(chunk, ep->medium_msg_threshold);

  /* single segment pointing inside the large send's first segment */
  frag->send.segment.segment_ptr = (char*)s->send.segments[0].segment_ptr + s->send.frag_off;
  frag->send.segment.segment_length = chunk;
  frag->send.segments = &frag->send.segment;
  frag->send.count = 1;

  frag->basic.partner = s->basic.partner;
  frag->send.basic.status.match_info = (MX__ONESIDED_PUT_FAKE |
                                        ((uint64_t)s->send.peer_rdma << 32) |
                                        s->send.frag_off);
  frag->send.basic.status.source = s->basic.status.source;
  frag->send.basic.state = MX__REQUEST_STATE_SEND_QUEUED | MX__REQUEST_STATE_INTERNAL;

  /* pick the send type from the chunk size */
  if (chunk > ep->medium_msg_threshold) {
    mx_fatal("should never be there");
  } else if (chunk > ep->small_msg_threshold) {
    frag->send.basic.type = MX__REQUEST_TYPE_SEND_MEDIUM;
  } else if (chunk > ep->tiny_msg_threshold) {
    frag->send.basic.type = MX__REQUEST_TYPE_SEND_SMALL;
  } else {
    frag->send.basic.type = MX__REQUEST_TYPE_SEND_TINY;
  }

  frag->send.basic.wq = NULL;
  frag->send.basic.status.msg_length = chunk;
  frag->send.memory_context = MX_PIN_UNDEFINED;
  frag->send.basic.requeued = 0;
  frag->send.basic.status.context = s;

  mx__enqueue_request(&ep->send_reqq, frag);
  s->send.frag_off += chunk;
}
#endif

#if MX_NO_RNDV
/* Handle a CTS lib2lib message: find the large send designated by the
   message's send_rdma id, record the transfer length and the peer's rdma
   id, then post fake puts until the whole transfer is posted or six of
   them have been posted. */
static void
mx__process_recv_cts(mx_endpoint_t ep, union mx_request *dummy, mcp_uevt_msg_t *evt, void *dummy1)
{
  /* evt is embedded in the lib2lib message: step back to its start */
  union mx__lib2lib *msg = (void*)((char *)evt - offsetof(union mx__lib2lib,cts.evt));
  uint8_t rdma_id = *(uint8_t*)&msg->cts.send_rdma;
  union mx_request *s;
  int posted;

  mx_assert(rdma_id < (unsigned)ep->rdmas.max);
  s = ep->rdma_requests[rdma_id].req;
  mx_assert(s && s->send.reg_area.rdma_id == rdma_id);

  s->basic.status.xfer_length = ntohl(msg->cts.xfer_length);
  s->send.peer_rdma = msg->cts.recv_rdma;
  s->send.accum = 0;
  s->send.frag_off = 0;

  for (posted = 0; ; ) {
    mx__fake_put(ep, s);
    posted++;
    if (s->send.frag_off == s->basic.status.xfer_length || posted >= 6)
      break;
  }
}
#endif

/* Layout that mx__process_recv_notify() overlays on an incoming event to
   read the notify fields. */
struct mx__fake_notify_evt {
  mcp_uevt_msg_t msg;
  uint32_t length_n;     /* read through ntohl() by the handler */
  uint8_t rdma_id;       /* passed to mx__rndv_got_notify() as the rdma id */
  uint8_t rdma_seqnum;   /* passed to mx__rndv_got_notify() as the seqno */
};

/* Receive handler for notify events: decodes the rdma id, the length
   (converted with ntohl) and the rdma sequence number from the event and
   forwards them to mx__rndv_got_notify(). */
static void
mx__process_recv_notify(mx_endpoint_t ep, union mx_request *dummy, mcp_uevt_msg_t *evt, void *dummy1)
{
  struct mx__fake_notify_evt *notify_evt = (struct mx__fake_notify_evt *)evt;

  mx__rndv_got_notify(ep,
                      notify_evt->rdma_id,
                      ntohl(notify_evt->length_n),
                      notify_evt->rdma_seqnum);
}

/* Handle a message that is older than what the partner expects
   (order < 0 is asserted).  A liback is queued on the assumption that an
   earlier ack was lost; when verbose, the event is also logged. */
static void
mx__obsolete(mx_endpoint_t ep, struct mx__partner *partner, 
             int order, int msg_seqnum)
{
  mx_always_assert(order < 0);

  /* assume an ack has been lost */
  mx__queue_liback(ep, partner, 1);

  if (!mx__opt.verbose)
    return;

  if (!MX__SESNO(msg_seqnum ^ partner->recv_seq)) {
    /* same session bits: a plain redundant message */
    mx_printf_once("INFO:mx__process_recv:Redundant obsolete message"
                   "(msg=0x%x,partner=0x%x,fully=0x%x,order=%d)..OK\n",
                   msg_seqnum, partner->recv_seq, partner->fully_recv_seq, order / 4);
    return;
  }
  mx_printf("mx__process_recv:received message from previous session\n");
}

/* Process an event whose sequence number is exactly the one the partner
 * expects.
 *
 * For message types up to MX__LIB2LIB_RNDV the event is matched against
 * posted receives (or stored as an unexpected message) and the resulting
 * request is filled in before `recv_func` runs.  For other types
 * `recv_func` is called with a NULL request.
 *
 * The sequence-number part of partner->recv_seq is advanced by one on
 * success, including when the unexpected handler consumed the message.
 *
 * Returns 0 on success, -1 when no request could be created for an
 * unexpected message (recv_seq is then left untouched).
 */
static int inline
mx__process_ordered_evt(mx_endpoint_t ep, struct mx__partner *partner,
			int msg_seqnum, int type, mcp_uevt_msg_t *evt,
			void *data, mx__process_recv_msg_t recv_func)
{
  union mx_request *r = NULL;

  mx_assert(type != MX__LIB2LIB_RNDV);
  if (type <= MX__LIB2LIB_RNDV) {
    uint64_t match_info;
    uint32_t match_a, match_b, msg_length;
    uint32_t ctxid;
    void * unexp_handler_data = data;
    uint32_t unexp_handler_discard = 0;

    /* rebuild the 64-bit match information from its two halves */
    match_a = ntohl(evt->match_a);
    match_b = ntohl(evt->match_b);
    match_info = (uint64_t)match_a << 32 | match_b;
    ctxid = CTXID_FROM_MATCHING(ep, match_info);

    if (type == MX_MCP_UEVT_RECV_RNDV)
      /* get the length from the inline data for rndv messages */
      msg_length = ntohl(*(uint32_t*) (((mcp_uevt_tiny_t *) evt)->data));
    else
      msg_length = ntohs(evt->length);

    /* find the data if the message has been received */
    if (type == MX_MCP_UEVT_RECV_TINY) {
      /* use inline data */
      unexp_handler_data = ((mcp_uevt_tiny_t *) evt)->data;
    } else if (type == MX_MCP_UEVT_RECV_SMALL) {
      /* data pointer is already ok */
    } else if (type == MX_MCP_UEVT_RECV_MEDIUM
	       && msg_length == ntohs (((mcp_uevt_medium_t *) evt)->frame_length)) {
      /* data is pointing to the fragment and this fragment is the entire message */
    } else {
      /* data is not available in other cases */
      unexp_handler_data = NULL;
    }

    r = mx__endpoint_match_receive(ep, partner, match_info, msg_length,
				   unexp_handler_data, &unexp_handler_discard);
    if (unexp_handler_discard) {
      /* this message has been processed by the unexpected handler, do nothing else */
      partner->recv_seq = MX__SESNO(partner->recv_seq) | MX__SEQNO(partner->recv_seq + 1);
      return 0;
    }
    if (r == NULL) {
      /* no receive matched, store the message as an unexpected */
      r = mx__create_unexp_for_evt(ep, type, msg_length, match_info, ctxid);
      if (r == NULL)
	return -1;
    }
    else {
      /* r matched this message */
      mx_assert(r->recv.unexpected == 0);
      r->recv.basic.state |= MX__REQUEST_STATE_RECV_MATCHED;
      /* Matched, take out of recv_reqq. */
      mx__spliceout_request(&ep->ctxid[ctxid].recv_reqq, r);
      /* Compute the xfer_length now for later use */
      r->recv.basic.status.xfer_length = MX_MIN(msg_length, r->recv.r_length);      
    }
    /* common bookkeeping for matched and unexpected requests */
    r->recv.basic.status.msg_length = msg_length;
    r->recv.basic.status.match_info = match_info;
    mx__partner_to_addr(partner, &r->recv.basic.status.source);
    r->basic.partner = partner;
    r->recv.msg_seq = msg_seqnum;
  }
  /* r is still NULL here for types above MX__LIB2LIB_RNDV */
  (*recv_func)(ep, r, evt, data);
  /* keep the session bits, advance the sequence number by one */
  partner->recv_seq = MX__SESNO(partner->recv_seq) | MX__SEQNO(partner->recv_seq + 1);
  return 0;
}

/* Drain the partner's queue of early (out-of-order) events for as long as
 * its head can be handled.
 *
 * The head is processed when it is the expected message (order == 0) or an
 * older one (order < 0); the loop stops as soon as the head is still in
 * the future, or when an in-order event cannot be processed.  Older events
 * are either further fragments of a partially received message or
 * obsolete messages.
 */
void
mx__process_early(mx_endpoint_t ep, struct mx__partner *partner)
{
  for (;;) {
    struct mx__early *head;
    mcp_uevt_msg_t *stored_evt;
    int order;

    if (mx__isempty_partner_early_queue(partner))
      return;

    head = mx__partner_first_early(partner);
    order = mx__msg_order(head->msg_seq, partner->recv_seq);
    if (order > 0)
      return;

    stored_evt = (mcp_uevt_msg_t*) &head->recv_tiny;
    if (order == 0) {
      /* the entry stays queued when it could not be processed */
      if (mx__process_ordered_evt(ep, partner, head->msg_seq, head->type,
                                  stored_evt, head->data, head->recv_func))
        return;
    } else {
      union mx_request *partial =
        mx__endpoint_match_partial_request(ep, head->msg_seq, partner);

      if (partial == NULL) {
        mx__obsolete(ep, partner, order, head->msg_seq);
      } else {
        /* the only case of late message in the early queue are other
           fragments of the message just taken into account */
        mx_assert(MX__SEQNO(partner->recv_seq) == MX__SEQNO(head->msg_seq + 1));
        mx_assert(head->type == MX_MCP_UEVT_RECV_MEDIUM);
        mx_assert(head->recv_func == mx__process_recv_medium);
        mx__process_recv_copy_frag(ep, partial, stored_evt, head->data, 1);
      }
    }
    mx__partner_drop_early(head);
  }
}

/* Dispatch one incoming message event according to how its sequence number
 * compares with what the source partner expects.
 *
 * ep        - receiving endpoint
 * type      - event type, forwarded to the processing helpers
 * evt       - the event; its sequence, peer index and piggybacked ack
 *             fields are converted from network byte order here
 * data      - payload pointer for the event, may be NULL
 * recv_func - handler invoked for the event once it is in order
 *
 * In-order events are processed immediately, followed by any queued early
 * events; late fragments of a partial message are copied in place; future
 * events are stored in the partner's early queue; anything else is treated
 * as obsolete.  Events from an unknown source peer index are dropped.
 *
 * returns 0 on success, 1 when processing must be stopped in the caller
 */
static int inline
mx__process_recvs(mx_endpoint_t ep, int type, mcp_uevt_msg_t *evt, void *data,
                  mx__process_recv_msg_t recv_func)
{
  int order;
  struct mx__partner *partner;
  union mx_request *r;
  uint32_t msg_seqnum;
  uint16_t src_peer_index;

  msg_seqnum = ntohs(evt->lib_seqnum);
  src_peer_index = ntohs(evt->src_peer_index);
  if (unlikely(src_peer_index == (uint16_t) MX_UNKNOWN_SRC_PEER_INDEX)) {
    if (mx__opt.verbose >= 1)
      mx_printf("received message type %d with unknown source peer index\n", type);
    return 0;
  }
  partner = mx__endpoint_lookup_partner(ep, evt->src_endpt, src_peer_index);
  order = mx__msg_order(msg_seqnum, partner->recv_seq);

  /* without firmware acks, the piggybacked ack is handled by the library;
     the field comes from the wire, hence ntohs like its siblings above */
  if (mx__opt.fw_ack == 0)
    mx__handle_liback(ep, partner, MX__SEQNO(ntohs(evt->lib_piggyack)));

  if (order == 0) {
    /* exactly the expected message */
    if (mx__process_ordered_evt(ep, partner, msg_seqnum, type, evt, data, recv_func))
      return 1;
    /* this may have unblocked events that arrived early */
    mx__process_early(ep, partner);
    mx__liback(ep, partner);
    return 0;
  } else if (order < 0 && data &&
             (r = mx__endpoint_match_partial_request(ep, msg_seqnum, partner))) {
    /* late event carrying another fragment of a partially received message */
    mx_assert(type == MX_MCP_UEVT_RECV_MEDIUM);
    mx__process_recv_copy_frag(ep, r, evt, data, 1);
    mx__liback(ep, partner);
    return 0;
  } else  if (order > 0) {
    if (ep->unexp_queue_length_max
        && !mx__isempty_partner_early_queue(partner)
        && ntohs(mx__partner_first_early(partner)->recv_tiny.lib_seqnum)
             == partner->recv_seq) {
      /* don't insert early if we stopped processing the early list */
      mx_assert(mx__virtual_unexp_length_too_big(ep, ep->unexp_queue_length_max));
      return 1;
    }
    mx_assert(mx__isempty_partner_early_queue(partner)
              || ntohs(mx__partner_first_early(partner)->recv_tiny.lib_seqnum)
                 != partner->recv_seq);

    /* event from the future: keep it until its turn comes */
    MX__EP_STATS_INC(ep, early);
    if (mx__partner_insert_early(partner, evt,
                                 msg_seqnum,
                                 recv_func,
                                 type, data) == NULL) {
      mx_fatal("mx__partner_insert_early failed");
      return 1;
    }
    return 0;
  } else {
    mx__obsolete(ep, partner, order, msg_seqnum);
    return 0;
  }
}

/* Finish a send request.
 *
 * ep     - endpoint owning the request
 * r      - send request being completed
 * status - completion status; any value other than MX_STATUS_SUCCESS is
 *          reported through mx__error_req()
 *
 * A live request is marked COMPLETED and handed to waiters/peekers, except
 * for one-sided requests other than MX__ONESIDED_PUT, which are freed here.
 * A request carrying the DEAD state is freed and ep->dead_count is
 * decremented.
 */
void
mx__send_complete(mx_endpoint_t ep, union mx_request *r, mx_status_code_t status)
{
#if MX_DEBUG
  if (status == MX_SUCCESS && r->send.basic.status.code == MX_SUCCESS)
    mx_assert(r->send.basic.status.xfer_length != -1);
#endif

  MX__EP_STATS_INC(ep, completion);
  if (ep->in_progression_thread)
    MX__EP_STATS_INC(ep, overlapped_completion);    

  if (r->send.count > 1) {
    /* release the segment list */
    mx_free(r->send.segments);
  }

  if (!(r->send.basic.state & MX__REQUEST_STATE_DEAD)) {
    uint64_t match_info = r->send.basic.status.match_info;
    uint32_t ctxid = CTXID_FROM_MATCHING(ep, match_info);
    /* The request is complete, but the user hasn't tested or waited
       on it yet. */
    mx_assert(!(r->send.basic.state & MX__REQUEST_STATE_MCP));
    mx_assert(!(r->send.basic.state & MX__REQUEST_STATE_SEND_QUEUED));
    r->send.basic.state |= MX__REQUEST_STATE_COMPLETED;
    /* TODO: Use generic status translation function/macro. */

    if (r->send.basic.status.code == MX_SUCCESS) {
      /* only set the status if it is not already set to an error */
      if (status == MX_STATUS_SUCCESS) {
	/* a short transfer is reported as a truncation */
	if (r->send.basic.status.xfer_length < r->send.basic.status.msg_length) {
	  r->send.basic.status.code = 
	    mx__error_req(ep, "send req", r, MX_STATUS_TRUNCATED);
	}
      } else {
	r->send.basic.status.code = 
	  mx__error_req(ep, "send req", r, status);
      }
    }

#if MX_ONE_SIDED
    if (match_info & MX__ONESIDED_MASK) {
      if ((match_info & MX__ONESIDED_MASK) == MX__ONESIDED_GET) {
	/* for a get, the context holds the descriptor to release (or NULL) */
	mx__get_t *get = r->basic.status.context;
	if (get) {
	  mx_free(get);
	}
      } 
#if MX_NO_RNDV
      else if ((match_info & MX__ONESIDED_MASK) == MX__ONESIDED_PUT_FAKE) {
	/* for a fake put, the context holds the large send it belongs to */
	union mx_request *large = r->basic.status.context;
	mx_fixme_assert(status == MX_STATUS_SUCCESS);
	large->send.accum += r->basic.status.xfer_length;
	if (large->send.accum == large->basic.status.xfer_length) {
	  /* every chunk is done: signal completion of the large send */
	  unsigned seqno = ep->rdma_requests[large->send.local_rdma_id].seqno;
	  mx_assert(large->send.frag_off == large->basic.status.xfer_length);
	  mx_assert(large == ep->rdma_requests[large->send.local_rdma_id].req);
	  mx__rndv_got_notify(ep, large->send.local_rdma_id, large->basic.status.xfer_length, seqno);
	} else if (large->send.frag_off < large->basic.status.xfer_length) {
	  /* chunks remain to be posted: queue the next one */
	  mx__fake_put(ep, large);
	}
      }    
#endif
      /* default case of ONE_SIDED messages */
      if ((match_info & MX__ONESIDED_MASK) != MX__ONESIDED_PUT) {
	/* PUT is the only case whose send completion is notified */
	mx__rl_free(ep, r);
	return;
      }
    }
#endif
    mx__notify_waiter_request_done(ep, r);
    mx__notify_peeker_request_done(ep, r, ctxid);
  }
  else {
    /* Someone already tested or waited on handle. The request is
       complete. Or mx_forget was called. */
    if (status != MX_STATUS_SUCCESS) {
      if (!ep->cancelled) {
	/* only handle errors when the endpoint is totally open */
	mx__error_req(ep, "send req(already completed)", r, status);
      }
    }
    ep->dead_count -= 1;
    mx__rl_free(ep, r);
  }
}

/* Hand the send-queue buffer of a medium send back to the endpoint's
 * send pool. The buffer address is rebuilt from the vpage index stored
 * in the request (r->send.offset_data).
 */
static void
mx__release_send_medium(struct mx_endpoint *ep, union mx_request *r)
{
  char *medium_buf = ep->sendq + (r->send.offset_data << MX_MCP_VPAGE_SHIFT);

  mx__memory_pool_free(ep->send_pool, medium_buf, ep->medium_msg_threshold);
}

/*
 * Release the resources attached to a large receive.
 *
 *   ep     - endpoint the request belongs to
 *   r      - large receive request
 *   status - final status of the transfer; data staged in the
 *            contiguous bounce buffer is copied to the user segments
 *            only for MX_STATUS_SUCCESS or MX_STATUS_TRUNCATED
 *
 * Unless MX_NO_RNDV is set, nothing is released when xfer_length is 0.
 * Otherwise the RDMA window is released, the bounce buffer (or the
 * multi-segment registration array) is freed, and the rdma_requests
 * slot is cleared.
 */
static void
mx__release_recv_large(struct mx_endpoint *ep, union mx_request *r, 
		       mx_status_code_t status)
{
#if MX_OS_UDRV
  {
    struct mx_rdma_req *rdma;
    rdma = ep->rdma_requests + r->recv.local_rdma_id;
    if (rdma->copy) {
      /* Source and destination are in the same segment and overlap as
       * soon as copy_offset < xfer_length, so memcpy() would be
       * undefined here; memmove() handles overlapping regions. */
      memmove((char*)r->recv.segment.segment_ptr,
	      (char*)r->recv.segment.segment_ptr + rdma->copy_offset,
	      r->recv.basic.status.xfer_length);
    }
  }
#endif
  if (MX_NO_RNDV || r->basic.status.xfer_length) {
    mx_rdmawin_release(ep, r->recv.local_rdma_id, 0, &r->recv.reg_area);
    if (r->recv.contiguous_copy) {
      /* Only deliver the staged data when the transfer produced some,
       * but always free the bounce buffer: it used to leak whenever
       * the receive ended with an error status. */
      if (status == MX_STATUS_SUCCESS || status == MX_STATUS_TRUNCATED) {
	mx__copy_to_segments(r->recv.segments, r->recv.count, r->recv.memory_context,
			     0, r->recv.contiguous_copy, r->basic.status.xfer_length);
      }
      mx_free(r->recv.contiguous_copy);
    } else if (r->recv.reg_area.nsegs > 1) {
      /* multi-segment registration: vaddr holds the allocated array */
      mx_free((void*)(uintptr_t)r->recv.reg_area.segs.vaddr);
    }
    ep->rdma_requests[r->recv.local_rdma_id].req = NULL;
  }
}

/* Free what a large send was holding: bump the sequence number of its
 * RDMA window slot, release the window, free either the contiguous
 * bounce buffer or the multi-segment registration array, and clear the
 * slot's request pointer.
 * To be called once the notify has arrived (or the request was aborted).
 */
static void
mx__release_send_large(mx_endpoint_t ep, union mx_request * req)
{
  const unsigned window = req->send.local_rdma_id;

  ep->rdma_requests[window].seqno += 1;
  mx_rdmawin_release(ep, window, 1, &req->send.reg_area);

  if (!req->send.contiguous_copy) {
    if (req->send.reg_area.nsegs > 1) {
      mx_free((void*)(uintptr_t) req->send.reg_area.segs.vaddr);
    }
  } else {
    mx_free(req->send.contiguous_copy);
  }

  ep->rdma_requests[window].req = NULL;
}

/*
 * Take the next step for a request that is acked and is no longer
 * flagged as being in the MCP.
 *
 * The partner's quadrant counter for the request's sequence number is
 * decremented, a medium send gives its buffer back, and the request is
 * then routed by type:
 *  - large receive: queued on large_getq (MX_NO_RNDV), or released and
 *    completed;
 *  - large send not yet REPLIED: parked on notifying_large_sendq when
 *    status is success, released and completed otherwise;
 *  - connect: nothing more is done here;
 *  - anything else: completed through mx__send_complete().
 */
static void
mx__send_acked_and_mcp_complete(mx_endpoint_t ep, union mx_request *r, mx_status_code_t status)
{
  struct mx__partner *peer = r->basic.partner;

  mx_assert(peer->quadrant_count[MX__QUADRANT(r->basic.send_seq)] > 0);
  peer->quadrant_count[MX__QUADRANT(r->basic.send_seq)] -= 1;

  if (r->basic.type == MX__REQUEST_TYPE_SEND_MEDIUM)
    mx__release_send_medium(ep, r);

  if (r->basic.type == MX__REQUEST_TYPE_RECV_LARGE) {
#if MX_NO_RNDV
    mx__enqueue_request(&ep->large_getq, r);
#else
    mx__release_recv_large(ep, r, status);
    mx__recv_complete(ep, r, status);
#endif
    return;
  }

  if (r->send.basic.type == MX__REQUEST_TYPE_SEND_LARGE
      && !(r->send.basic.state & MX__REQUEST_STATE_REPLIED)) {
    if (status != MX_SUCCESS) {
      /* the scout failed: no notify to wait for */
      mx__release_send_large(ep, r);
      mx__send_complete (ep, r, status);
    } else {
      /* scout done, now wait for the notify */
      mx__enqueue_request(&ep->notifying_large_sendq, r);
    }
    return;
  }

  if (r->basic.type == MX__REQUEST_TYPE_CONNECT)
    return;

  /* either less than large completion or
     the uncommon case of large send completion with scout-ack
     arriving after the notification
  */
  mx__send_complete(ep, r, status);
}

/* Flag as acked a request that the caller has already removed from the
 * partner's pending list.
 * If the request is still flagged as being in the MCP nothing else
 * happens here. Otherwise it is spliced out of `queue', its SEND_QUEUED
 * flag is cleared and it goes through mx__send_acked_and_mcp_complete()
 * with `status'.
 */
static void
mx__mark_request_acked(mx_endpoint_t ep, struct mx__partner *partner, union mx_request * r,
		       struct mx__request_queue_head * queue, mx_status_code_t status)
{
  mx_assert(!(r->basic.state & MX__REQUEST_STATE_ACKED));
  r->basic.state |= MX__REQUEST_STATE_ACKED;

  if (r->basic.state & MX__REQUEST_STATE_MCP) {
    /* still in the MCP, nothing more to do for now */
    return;
  }

  mx__spliceout_request(queue, r);
  /* in case it was on send_reqq or resend_reqq */
  r->send.basic.state &= ~MX__REQUEST_STATE_SEND_QUEUED;
  mx__send_acked_and_mcp_complete(ep, r, status);
}

/*
 * Process a library-level ack from `partner' acknowledging every
 * sequence number before `liback'.
 *
 * An ack that falls outside the window of currently unacked sequence
 * numbers is ignored (and reported when verbose >= 2). Otherwise
 * partner->send_acked is advanced one step at a time up to `liback';
 * at each step, if the head of the partner's pending queue carries the
 * sequence number just acked, it is removed from that queue and marked
 * acked.
 */
static void
mx__handle_liback(mx_endpoint_t ep, struct mx__partner *partner, uint16_t liback)
{
  struct mx__partner_request_queue_head * const pending = &partner->pendingq;
  mx_jiffies_t stamp = mx_jiffies(ep);
  uint32_t in_flight = MX__SEQNO(partner->send_seq - partner->send_acked);

  mx_assert(in_flight < MX__SEQNO_CNT / 2);
  if (MX__SEQNO(liback - partner->send_acked) > in_flight) {
    /* FIXME */
    if (mx__opt.verbose >= 2)
      mx_printf("duplicate ack, send_acked=%x,liback=0x%x,send_seq=%x\n",
		partner->send_acked, liback, partner->send_seq);
    return;
  }

  while (partner->send_acked != liback) {
    unsigned acked_seq = partner->send_acked;

    /* step to the next sequence number, wrapping at MX__SEQNO_CNT */
    partner->send_acked += 1;
    if (partner->send_acked >= MX__SEQNO_CNT)
      partner->send_acked = 0;
    partner->last_ack = stamp;

    if (!mx__isempty_partner_request_queue(pending)) {
      /* requests are acked in order, so only the head can match;
       * a mismatch means the request already got a nack or a reply */
      union mx_request * head = mx__first_partner_request(pending);

      if (MX__SEQNO(head->basic.send_seq) == acked_seq) {
	struct mx__request_queue_head * holder;

	mx__spliceout_partner_request(pending, head);

	/* the request is either queued (resend_reqq), pending
	 * (resend_list) or in the MCP; pick the queue it has to be
	 * removed from when it is not in the MCP
	 */
	if (head->basic.state & MX__REQUEST_STATE_SEND_QUEUED)
	  holder = &ep->resend_reqq;
	else
	  holder = &ep->resend_list;
	mx__mark_request_acked(ep, partner, head, holder, MX_STATUS_SUCCESS);
      }
    }
  }
}

/*
 * Handle a lib2lib ACK carried by a TRUC event.
 *
 *   ep  - endpoint that received the event
 *   evt - the TRUC event; its payload is read through MX__LIB2LIB()
 *
 * The ack is dropped when the source peer index is unknown, when the
 * session information does not match, or when its ack number is not
 * newer than the last one recorded for the partner. Otherwise the
 * partner's recv_acknum is updated and the acked sequence number is
 * passed to mx__handle_liback().
 */
static void
mx__handle_liback_evt(mx_endpoint_t ep, mcp_uevt_truc_t *evt)
{
  struct mx__partner * partner;
  union mx__lib2lib *lib2lib;
  uint16_t lib_seqnum;
  uint16_t liback;
  uint16_t src_peer_index;
  uint32_t acknum;

  lib2lib = MX__LIB2LIB(evt);
  /* wire field, converted to host order */
  lib_seqnum = ntohs(lib2lib->ack.lib_seqnum);
  liback = MX__SEQNO(lib_seqnum);

  src_peer_index = ntohs(evt->src_peer_index);
  if (unlikely(src_peer_index == (uint16_t) MX_UNKNOWN_SRC_PEER_INDEX)) {
    if (mx__opt.verbose >= 1)
      mx_printf("received liback with unknown source peer index\n");
    return;
  }
  partner = mx__endpoint_lookup_partner(ep, evt->src_endpt,
					src_peer_index);
  mx_fixme_assert(partner);
  if (ep->endpoint_sid_n != lib2lib->ack.session_id
      || MX__SESNO(partner->send_seq) !=  MX__SESNO(lib_seqnum)) {
    /* FIXME: should not happen since the MCP tests the session in the header of the truc message,
     * but we've seen it at least once... */
    if (mx__opt.verbose)
      mx_printf("received invalid liback:sid=0x%x(expect0x%x),"
		"send_seq=0x%x(my_recv_seq=0x%x)\n",
		lib2lib->ack.session_id, ep->endpoint_sid_n,
		partner->send_seq, lib_seqnum);
    return;
  }

  /* wire field converted to host order (the byte swap is the same in
   * both directions) */
  acknum = htonl(lib2lib->ack.acknum);
  if (acknum <= partner->recv_acknum) {
    /* Stale ack. It must be dropped whatever the verbosity is: the
     * test used to be and-ed with mx__opt.verbose, so a reordered ack
     * was processed (and recv_acknum moved backward) in quiet mode. */
    if (mx__opt.verbose)
      mx_printf("ack reordered on network\n");
    return;
  }
  partner->recv_acknum = acknum;
#if MX_DEBUG
  if (lib2lib->ack.requeued) {
    mx__print_partner(partner);
    mx_printf("requeued=%d, send_seq=0x%x\n", lib2lib->ack.requeued, ntohs(lib2lib->ack.send_seq));
    mx_fatal("Sender aborting");
  }
#endif
  mx__handle_liback(ep, partner, liback);
}

/*
 * The notify for a large (rendez-vous) send has arrived.
 *
 *   ep      - endpoint owning the send
 *   rdma_id - index of the RDMA window slot used by the send
 *   length  - number of bytes transferred, stored as xfer_length
 *   seqno   - sequence number of the RDMA window, only checked against
 *             the slot's current value
 *
 * The request is flagged REPLIED and its large-send resources are
 * released. If it was not acked yet, the notify also counts as the
 * ack. The request is completed right away unless it is still in the
 * MCP or queued for (re)sending, in which case completion is left to
 * whoever finishes that operation.
 */
static void
mx__rndv_got_notify(mx_endpoint_t ep, unsigned rdma_id, uint32_t length, unsigned seqno)
{
  union mx_request *r;
  struct mx__partner *partner;

  r = ep->rdma_requests[rdma_id].req;
  mx_assert(r);
  mx_assert(r->basic.type == MX__REQUEST_TYPE_SEND_LARGE);
  mx_assert(rdma_id == r->send.local_rdma_id);
  mx_assert(seqno == ep->rdma_requests[rdma_id].seqno);

  /* Remember the partner now: completing the request below may hand it
   * to mx__rl_free(), after which r must not be read anymore. */
  partner = r->basic.partner;

  r->send.basic.status.xfer_length = length;
  r->basic.state |= MX__REQUEST_STATE_REPLIED;
  mx__release_send_large(ep, r);

  if (!(r->basic.state & MX__REQUEST_STATE_ACKED)) {
    if (mx__opt.fw_ack == 0) {
      /* remove from the pending queue; the queue is selected with the
       * send sequence number, as done when the request was enqueued
       * (seqno is the RDMA window sequence number, not a send seqnum) */
      mx__spliceout_partner_request(&MX__PARTNER_PENDING_SEQNUM_TO_QUEUE(partner, r->basic.send_seq), r);
    }
    r->basic.state |= MX__REQUEST_STATE_ACKED;
    if (!(r->basic.state & (MX__REQUEST_STATE_MCP | MX__REQUEST_STATE_SEND_QUEUED))) {
      mx__spliceout_request(&ep->resend_list,r);
      mx__send_acked_and_mcp_complete(ep, r, MX_STATUS_SUCCESS);
      /* r may have been released, do not touch it past this point */
    }
    mx__queue_liback(ep, partner, 1);
  } else if (!(r->basic.state & (MX__REQUEST_STATE_MCP | MX__REQUEST_STATE_SEND_QUEUED))) {
    /* common case of large send completion, scout is completed mcp+acked when notif arrives */
    mx__spliceout_request(&ep->notifying_large_sendq, r);
    mx__queue_liback(ep, partner, 1);
    mx__send_complete (ep, r, MX_STATUS_SUCCESS);
  } else {
    /* wait for requeued or mcp to complete ongoing op before going further */
  }
}

/* Aborts a request and sets its status.
 *
 *   ep          - endpoint owning the request
 *   partner     - partner used to locate the pending queue of the request
 *   r           - request to abort
 *   status_code - status to report for the request
 *
 * If the request is in the MCP, only the status code is recorded (or
 * nothing at all for connect replies and libacks); the actual abort
 * happens when the MCP send completion event arrives in
 * mx__process_events().
 * See the bottom of mx__partner.c for documentation about disconnection and so
 *
 * Returns 0 when the request type is handled, -1 when it is not (the
 * caller is expected to report the problem).
 */
int
mx__abort_sent_request(struct mx_endpoint * ep, struct mx__partner * partner,
		       union mx_request * r, mx_status_code_t status_code)
{
  if (r->basic.type == MX__REQUEST_TYPE_SEND_LARGE
      && !(r->basic.state & MX__REQUEST_STATE_MCP)
      && (r->basic.state & MX__REQUEST_STATE_ACKED)) {
    /* large send, scout completed by the mcp and acked, waiting for the notify.
     * complete and release resources */
    mx__spliceout_request(&ep->notifying_large_sendq, r);
    mx__release_send_large(ep, r);    
    mx__send_complete (ep, r, status_code);

  } else if (r->basic.type == MX__REQUEST_TYPE_SEND_TINY
	     || r->basic.type == MX__REQUEST_TYPE_SEND_SMALL
	     || r->basic.type == MX__REQUEST_TYPE_SEND_MEDIUM
	     || r->basic.type == MX__REQUEST_TYPE_SEND_LARGE) {
    /* send failed */
    if (r->send.basic.state & MX__REQUEST_STATE_MCP) {
      /* report the status and let process events deal with it when
       * the request comes back from the MCP */
      r->basic.status.code = status_code;
    } else if (!r->basic.requeued
	        && (r->send.basic.state & MX__REQUEST_STATE_SEND_QUEUED)) {
      /* abort before the request got sent (and resources allocated) */
      mx__spliceout_request(&ep->send_reqq, r);
      r->send.basic.state &= ~MX__REQUEST_STATE_SEND_QUEUED; /* required for mx__send_complete */
      mx__send_complete(ep, r, status_code);
    } else {
      /* the request has been sent at least once, behave as if we got an ack */
      struct mx__partner_request_queue_head * pendingq;
      /* a queued request can only be a requeued one at this point
       * (the non-requeued case was handled just above) */
      struct mx__request_queue_head * queue = 
	(r->send.basic.state & MX__REQUEST_STATE_SEND_QUEUED)
	  ? &ep->resend_reqq : &ep->resend_list;

      pendingq = &MX__PARTNER_PENDING_SEQNUM_TO_QUEUE(partner, r->basic.send_seq);
      mx__spliceout_partner_request(pendingq, r);
      mx__mark_request_acked(ep, partner, r, queue, status_code);
    }

  } else if (r->basic.type == MX__REQUEST_TYPE_RECV_LARGE) {
    /* recv large failed */

    /* in case of nack during the GET, we won't get here, see UEVT_ERROR in process_events() */

    if (r->recv.basic.state & MX__REQUEST_STATE_MCP) {
      /* report the status and let process events deal with it when
       * the request comes back from the MCP */
      r->basic.status.code = status_code;
    } else if (!r->basic.requeued
	       && (r->recv.basic.state & MX__REQUEST_STATE_SEND_QUEUED)
	       && !r->recv.notifying) {
      /* abort before the resources have been allocated */
      mx__spliceout_request(&ep->send_reqq, r);
      mx__recv_complete(ep, r, status_code);
    } else {
      /* resources have been allocated, behave as if we got an ack for the notify */
      struct mx__partner_request_queue_head * pendingq;
      /* queued and requeued: resend_reqq; queued for the first time:
       * send_reqq; not queued: resend_list */
      struct mx__request_queue_head * queue = 
	(r->recv.basic.state & MX__REQUEST_STATE_SEND_QUEUED)
	  ? ((r->basic.requeued) ? &ep->resend_reqq : &ep->send_reqq)
	  : &ep->resend_list;

      pendingq = &MX__PARTNER_PENDING_SEQNUM_TO_QUEUE(partner, r->basic.send_seq);
      mx__spliceout_partner_request(pendingq, r);
      mx__mark_request_acked(ep, partner, r, queue, status_code);
    } 

  } else if (r->basic.type == MX__REQUEST_TYPE_CONNECT) {
    /* connect failed, report the status */
    if (r->send.basic.state & MX__REQUEST_STATE_MCP) {
      /* report the status and let process events deal with it when
       * the request comes back from the MCP */
      r->connect.basic.status.code = status_code;
    } else {
      /* complete the request as if we got a connect reply */
      struct mx__request_queue_head * queue = 
	(r->send.basic.state & MX__REQUEST_STATE_SEND_QUEUED)
	  ? ((r->basic.requeued) ? &ep->resend_reqq : &ep->send_reqq)
	  : &ep->resend_list;
      r->connect.basic.state |= MX__REQUEST_STATE_REPLIED;
      mx__spliceout_request(queue, r);
      /* nothing is allocated with this request,
       * just call mx__connect_complete() in all cases. */
      mx__connect_complete(ep, r, status_code);
    }

  } else if (r->basic.type == MX__REQUEST_TYPE_CONNECT_REPLY) {
    /* connect reply failed */
    if (!(r->send.basic.state & MX__REQUEST_STATE_MCP)) {
      /* forget about it */
      struct mx__request_queue_head * queue =
	(r->send.basic.state & MX__REQUEST_STATE_SEND_QUEUED) ?
	&ep->send_reqq : &ep->mcp_connect_replyq;
      mx__spliceout_request(queue, r);
      /* nothing is allocated with this request */
      mx__rl_free(ep, r);
    }

  } else if (r->basic.type == MX__REQUEST_TYPE_LIBACK) {
    /* liback failed */
    if (!(r->send.basic.state & MX__REQUEST_STATE_MCP)) {
      /* forget about it */
      r->basic.partner->liback_pending = NULL;
      ep->liback_count -= 1;
      mx__spliceout_request(&ep->resend_reqq, r);
      /* nothing is allocated with this request */
      mx__rl_free(ep, r);
    }

  } else {
    /* don't know how to abort this request, let the caller report the right problem */
    return -1;
  }

  return 0;
}

/*
 * Consume events from the endpoint's MCP event queue.
 *
 *   ep          - endpoint whose event queue is read
 *   event_count - maximum number of events to consume in this call
 *
 * Events are read at ep->eventq_uevt and dispatched on their type. Once
 * handled, the slot's type is reset to 0 and the queue cursor and
 * counters are advanced (wrapping at the end of the queue).
 *
 * Returns 0 when the next slot is empty (type 0) or when event_count
 * events have been consumed. Returns 1 as soon as mx__process_recvs()
 * returns non-zero for a receive event; in that case the function
 * returns before the slot is cleared or the cursor advanced, so the
 * same event is seen again on the next call.
 */
static int
mx__process_events(struct mx_endpoint * ep, unsigned int event_count)
{
  unsigned int i, recvq_offset;
  union mx_request *r;
  /* TODO: Should mcp_event be volatile? */
  /* EVENT: uevt_recv_eager */
  mcp_uevt_t *mcp_event;
  uint8_t type;
  uint16_t mcp_handle;

  i = 0;
  do {
    mcp_event = (mcp_uevt_t *) ep->eventq_uevt;
    type = mcp_event->basic.type;
    /* consumed slots are reset to 0 at the bottom of the loop, so a
     * zero type means there is no event to process here */
    if (type == 0)
      return 0;
    /* NOTE(review): presumably orders the type read above before the
     * reads of the event payload below - confirm */
    MX_READBAR();
    switch(type) {
#if MX_DRIVER_API_MAGIC < 0x500
#define MX_MCP_UEVT_DONE MX_MCP_UEVT_DONE_SUCCESS
#endif
    case MX_MCP_UEVT_DONE:
      /* the MCP is done with a request; lib_cookie is the handle used
       * to find it in the handle map */
      mcp_handle =  ntohs(mcp_event->done.uevt.lib_cookie);
      r = mx__hm_get(ep->handle_map, mcp_handle);

      mx_assert(r->basic.state & MX__REQUEST_STATE_MCP);
      mx_assert(!(r->basic.state & MX__REQUEST_STATE_SEND_QUEUED));
      r->basic.state &= ~MX__REQUEST_STATE_MCP;
      /* NOTE(review): fw_ack is or-ed directly into the state, so it is
       * presumably either 0 or the ACKED state bit - confirm */
      r->basic.state |= mx__opt.fw_ack;

      /* Return resources. Buffered sends no longer have to be waited on
	 at endpoint close. */
      switch (r->send.basic.type) {
      case MX__REQUEST_TYPE_SEND_TINY:
	mx_assert(r->basic.state & MX__REQUEST_STATE_BUFFERED);
	/* Take out of buffered_sendq. */
	mx__spliceout_request(&ep->buffered_sendq, r);
	break;
      case MX__REQUEST_TYPE_SEND_SMALL:
	mx_assert(r->basic.state & MX__REQUEST_STATE_BUFFERED);
	if (r->send.basic.status.msg_length > 0) {
	  /* give the small message buffer back to the stack */
	  mx__ptr_stack_push(ep->small_msg_ptrs,
			     ep->udataq + r->send.offset_data);
	}
	/* Take out of buffered_sendq. */
	mx__spliceout_request(&ep->buffered_sendq, r);
	break;
      case MX__REQUEST_TYPE_SEND_MEDIUM:
	mx_assert(r->basic.state & MX__REQUEST_STATE_BUFFERED);
	/* Take out of buffered_sendq. */
	mx__spliceout_request(&ep->buffered_sendq, r);
	break;
      case MX__REQUEST_TYPE_SEND_LARGE:
	mx__spliceout_request(&ep->large_sendq, r);
	break;
      case MX__REQUEST_TYPE_RECV_LARGE:
	mx__spliceout_request(&ep->large_getq, r);
#if !MX_NO_RNDV
	if (!r->recv.notifying) {
	  /* first DONE for this request: queue it again on send_reqq,
	   * this time flagged as notifying */
	  r->recv.notifying = 1;
	  r->basic.requeued = 0;
	  r->basic.state |= MX__REQUEST_STATE_SEND_QUEUED;
	  mx__enqueue_request(&ep->send_reqq, r);
	  goto finish_send_processing;
	} else {
	  break;
	}
#else
	break;
#endif
      case MX__REQUEST_TYPE_CONNECT:
	mx__spliceout_request(&ep->mcp_connectq, r);
	if (r->basic.state & MX__REQUEST_STATE_REPLIED) {
	  mx__connect_complete(ep, r, MX_SUCCESS);
	} else {
	  /* no reply yet: keep the connect on the resend list */
	  mx__enqueue_request(&ep->resend_list, r);
	  r->basic.last_send_time = mx_jiffies(ep);
	}
	goto finish_send_processing;
      case MX__REQUEST_TYPE_CONNECT_REPLY:
	mx__spliceout_request(&ep->mcp_connect_replyq, r);
	mx__rl_free(ep, r);
	goto finish_send_processing;
      case MX__REQUEST_TYPE_LIBACK:
	ep->liback_count -= 1;
	mx__spliceout_request(&ep->ackq, r);
	mx__rl_free(ep, r);
	goto finish_send_processing;
      default:
	MX_PRINT(("unknown request type %d\n", (int)r->send.basic.type));
	mx_assert(0);
	break;
      }

      /* only the request types that left the switch with a break get here */
      if (r->basic.state & MX__REQUEST_STATE_ACKED) {
	mx__send_acked_and_mcp_complete(ep, r, MX_STATUS_SUCCESS);
      } else {
	/* not acked yet: park the request on the resend list */
	mx_assert(mx__opt.fw_ack == 0);
	mx__enqueue_request(&ep->resend_list, r);
	r->basic.last_send_time = mx_jiffies(ep);
      }

    finish_send_processing:
      mx__endpoint_free_mcp_handle(ep, mcp_handle);
      break;

    case MX_MCP_UEVT_RECV_TRUC:
      {
	/* lib2lib message, dispatched on its own type field */
	union mx__lib2lib *lib2lib = MX__LIB2LIB(&mcp_event->truc);
	if (lib2lib->type == MX__LIB2LIB_ACK) {
	  mx__handle_liback_evt(ep, &mcp_event->truc);
#if MX_NO_RNDV
	} else if (lib2lib->type == MX__LIB2LIB_RNDV) {
	  /* copy the source identification from the TRUC header into
	   * the embedded event before processing it */
	  lib2lib->rndv.evt.src_endpt = mcp_event->truc.src_endpt;
	  lib2lib->rndv.evt.src_peer_index = mcp_event->truc.src_peer_index;
	  if (mx__process_recvs(ep, MX_MCP_UEVT_RECV_RNDV, &lib2lib->rndv.evt, 
				0, mx__process_recv_large) != 0) {
	    return 1;
	  }
	} else if (lib2lib->type == MX__LIB2LIB_CTS) {
	  lib2lib->cts.evt.src_endpt = mcp_event->truc.src_endpt;
	  lib2lib->cts.evt.src_peer_index = mcp_event->truc.src_peer_index;
	  if (mx__process_recvs(ep, MX__LIB2LIB_CTS, &lib2lib->cts.evt, 
				0, mx__process_recv_cts) != 0) {
	    return 1;
	  }
#endif
	} else {
	  mx_printf("Unknown lib2lib type=%d\n", lib2lib->type);
	  mx_fatal("bad lib2lib type");
	}
      }
      break;

    case MX_MCP_UEVT_RECV_CONNECT:
      /* data[10] tells a connect (0) from a connect reply (1) */
      if (mcp_event->connect.data[10] == 0) {
	mx__handle_connect(ep, &mcp_event->connect);
      } else if (mcp_event->connect.data[10] == 1) {
	mx__handle_connect_reply(ep, &mcp_event->connect);
      } else {
	mx_fatal("bad is_reply type in connect evt");
      }
      break;

    case MX_MCP_UEVT_RECV_TINY:
      if (mx__process_recvs(ep, type, (mcp_uevt_msg_t*)&(mcp_event->tiny),
			    0, mx__process_recv_tiny) != 0) {
	return 1;
      }
      break;
      
    case MX_MCP_UEVT_RECV_SMALL:
      /* the event gives the vpage index of the data in the receive queue */
      recvq_offset = (ntohs(mcp_event->small.uevt.recvq_vpage_index) 
		      << MX_MCP_VPAGE_SHIFT);
      ep->recvq_loc = recvq_offset;
      if (mx__process_recvs(ep, type, (mcp_uevt_msg_t*)&(mcp_event->small.uevt),
			    ep->recvq + recvq_offset, mx__process_recv_small) != 0) {
	return 1;
      }
      break;

    case MX_MCP_UEVT_RECV_MEDIUM:
      recvq_offset = (ntohs(mcp_event->medium.uevt.recvq_vpage_index) 
		      << MX_MCP_VPAGE_SHIFT);
      ep->recvq_loc = recvq_offset;
      if (mx__process_recvs(ep, type, (mcp_uevt_msg_t*)&(mcp_event->medium.uevt), 
			    ep->recvq + recvq_offset, mx__process_recv_medium) != 0) {
	return 1;
      }
      break;
      
    case MX_MCP_UEVT_RECV_RNDV:
      if (mx__process_recvs(ep, type, (mcp_uevt_msg_t*)&(mcp_event->tiny), 
			    0, mx__process_recv_large) != 0) {
	return 1;
      }
      break;
      
    case MX_MCP_UEVT_RECV_NOTIFY:
      {
	/* repackage the notify event into a message-style event that
	 * mx__process_recvs() can take */
	struct mx__fake_notify_evt fake_evt;
	fake_evt.msg.src_endpt = mcp_event->notify.uevt.src_endpt;
	fake_evt.msg.src_peer_index = mcp_event->notify.uevt.src_peer_index;
	fake_evt.msg.lib_seqnum = mcp_event->notify.uevt.lib_seqnum;
	fake_evt.msg.lib_piggyack = mcp_event->notify.uevt.lib_piggyack;
	fake_evt.length_n = mcp_event->notify.uevt.length;
	fake_evt.rdma_id = mcp_event->notify.uevt.rdmawin_id;
	fake_evt.rdma_seqnum = mcp_event->notify.uevt.rdmawin_seqnum;
	if (mx__process_recvs(ep, MX__LIB2LIB_NOTIFY,
			      &fake_evt.msg, 0, mx__process_recv_notify) != 0) {
	  return 1;
	}
      }
      break;

    case MX_MCP_UEVT_RECV_NACK:
      {
	struct mx__partner *partner;
	struct mx__partner_request_queue_head * elt;
	uint32_t non_acked;
	uint16_t nack_seq = htons(mcp_event->nack.uevt.lib_seqnum);
	uint8_t status_code = mcp_event->nack.uevt.status;

	partner = mx__endpoint_lookup_partner(ep, 
					      mcp_event->nack.uevt.dest_endpt,
					      ntohs(mcp_event->nack.uevt.dest_peer_index));
	non_acked = MX__SEQNO(partner->send_seq - partner->send_acked);
	mx_assert(non_acked <= MX__SEQNO_CNT / 2);
	/* look for the nacked request in the partner's pending queue */
	MX__FOREACH_PARTNER_REQ(r, elt, &MX__PARTNER_PENDING_SEQNUM_TO_QUEUE(partner, nack_seq)) {
	  if (r->basic.send_seq == nack_seq)
	    goto found;
	  else if (r->basic.send_seq > nack_seq)
	    break;
	}
	/* either reached the end of list or found a request with bigger seqnum,
	 * the request is not here anymore, already aborted? */
	r = NULL;
       found:

	/* The connect is not in the pending array, but it is queued before
	 * sending anything (in the common case). We optimistically look at
	 * the first resend_list element in case we find the connect request
	 */
	if (!r && !mx__isempty_request_queue(&ep->resend_list)
	    && mx__first_request(&ep->resend_list)->basic.partner == partner
	    && mx__first_request(&ep->resend_list)->basic.type == MX__REQUEST_TYPE_CONNECT) {
	  /* found the connect request, great! */
	  r = mx__first_request(&ep->resend_list);

	} else if (!r || MX__SEQNO(nack_seq - partner->send_acked) > non_acked) {
	  /* FIXME */
	  if (mx__opt.verbose >= 2) {
	    mx_printf("Obsolete Nack ignored, send_acked=%x,liback=0x%x,send_seq=%x\n\t",
		      partner->send_acked, nack_seq, partner->send_seq);
	    mx__print_partner(partner);
	    mx_printf("\n");
	  }
	  /* leaves the switch: the event is still consumed below */
	  break;
	}

	if (mx__opt.verbose >= 1) {
	  mx_printf("Got Nack (%s) for request\n",
		    mx_strstatus(mcp_event->nack.uevt.status));
	  mx__dump_request(ep, r);
	}

	/* See the bottom of mx__partner.c for documentation about disconnection and so */
	if (mx__abort_sent_request(ep, partner, r,
				   mx__error_req(ep, "Got a NACK", r, status_code)) < 0) {
	  mx_printf("Got Nack (%s) for unexpected request type\n",
		    mx_strstatus(mcp_event->nack.uevt.status));
	  mx__dump_request(ep, r);
	  mx_printf("Was trying to reach\n\t");
	  mx__print_partner(partner);
	  mx_printf("/%d\n", mcp_event->nack.uevt.dest_endpt);
	  mx_always_assert(0);
	}
	break;
      }

#if MX_DRIVER_API_MAGIC >= 0x500
    case MX_MCP_UEVT_ERROR:
      {
	/* a large get posted to the MCP failed */
	uint8_t status_code = mcp_event->done.uevt.status;

	mcp_handle =  ntohs(mcp_event->done.uevt.lib_cookie);
	r = mx__hm_get(ep->handle_map, mcp_handle);
	mx_assert(r != NULL);

	mx_assert(r->basic.type == MX__REQUEST_TYPE_RECV_LARGE);
	mx_assert(!r->recv.notifying);
	mx_assert(r->basic.state & MX__REQUEST_STATE_MCP);
	mx_assert(!(r->basic.state & MX__REQUEST_STATE_SEND_QUEUED));

	if (mx__opt.verbose)
	  mx_printf("Large get failed with status %d (%s)\n",
		    status_code, mx_strstatus(status_code));

	/* do not use mx__abort_sent_request() since this case is very specific,
	 * complete the request here */
	r->basic.state &= ~MX__REQUEST_STATE_MCP;
	mx__spliceout_request(&ep->large_getq, r);
	status_code = mx__error_req(ep, "Got a NACK, GET failed", r, status_code);
	mx__endpoint_free_mcp_handle(ep, mcp_handle);
	mx__release_recv_large(ep, r, status_code);
	mx__recv_complete(ep, r, status_code);
	break;
      }
#endif

    default:
      mx_printf("Unknown event type from MCP:%d\n", type);
      mx_fatal("Bailing out");
      break;
    }

    /* the event is consumed: mark the slot empty and move to the next one */
    mcp_event->basic.type = 0;
    ep->eventq_uevt++;
    ep->eventq_index++;
    ep->eventq_flow++;
    ep->event_count++;
    i++;
    
    /* wrap around at the end of the event queue */
    if (ep->eventq_index == (ep->eventq_length / sizeof (mcp_uevt_t))) {
      ep->eventq_uevt = (mcp_uevt_t *)ep->eventq;
      ep->eventq_index = 0;
    }
    mx_assert(ep->eventq_uevt == (mcp_uevt_t*)ep->eventq +
	      (ep->eventq_flow & (MX_MCP_EVENTQ_VPAGE_CNT * MX_MCP_VPAGE_SIZE / sizeof(mcp_uevt_t) - 1)));
  } while (i < event_count);
  return 0;
}

/* Post the pull request of a large receive to the MCP. The partner to
 * pull from is derived from the source address stored in the request
 * status; the origin RDMA window id, sequence number and offset are
 * passed through to mx__post_ureq_pull() unchanged.
 */
static void
mx__post_large_recv(struct mx_endpoint *ep, union mx_request *r, 
		    mcp_ureq_t *req, uint16_t mcp_handle, 
		    uint8_t origin_rdmawin_id, uint8_t origin_rdmawin_seqnum, 
		    uint16_t origin_rdma_offset)
{
  struct mx__partner *origin = mx__partner_from_addr(&r->recv.basic.status.source);

  mx__post_ureq_pull(ep->is_ze, req, origin, r,
		     origin_rdmawin_id, origin_rdmawin_seqnum,
		     origin_rdma_offset, mcp_handle);
}

#if MX_NO_RNDV
/*
 * MX_NO_RNDV only: build a lib2lib CTS message for a large receive and
 * post it to the request's partner as a TRUC request.
 *
 *   ep         - endpoint posting the message
 *   r          - large receive request; its accum counter is reset to 0
 *   req        - MCP request slot to fill
 *   mcp_handle - MCP handle associated with the posted request
 *   seqnum     - library sequence number to put in the message
 *
 * The message piggybacks an ack for partner->fully_recv_seq, which is
 * also recorded in partner->recv_acked.
 */
static void
mx__post_fake_large_recv(struct mx_endpoint *ep, union mx_request *r, 
			 mcp_ureq_t *req, uint16_t mcp_handle,
			 uint16_t seqnum)
{
  ALIGNED_MCP_UREQ(batch);
  union mx__lib2lib *lib2lib;
  struct mx__partner *partner = r->basic.partner;

  r->recv.accum = 0;
  lib2lib = MX__LIB2LIB(&batch->truc);
  lib2lib->type = MX__LIB2LIB_CTS;
  /* the source fields are set to -1 here; mx__process_events() fills
   * them in from the TRUC header when a CTS is received */
  lib2lib->cts.evt.src_endpt = -1;
  lib2lib->cts.evt.src_peer_index = -1;
  lib2lib->cts.evt.lib_seqnum = htons(seqnum);
  partner->recv_acked = partner->fully_recv_seq;
  lib2lib->cts.evt.lib_piggyack = htons(partner->fully_recv_seq);
  lib2lib->cts.evt.match_a =  0;
  lib2lib->cts.evt.match_b =  0;
  lib2lib->cts.xfer_length = htonl(r->recv.basic.status.xfer_length);
  lib2lib->cts.send_rdma = r->recv.remote_rdma;
  lib2lib->cts.recv_rdma = r->recv.reg_area.rdma_id;
  mx__post_ureq_truc(ep->is_ze, req, batch, partner->peer_index_n, partner->eid, 
		     partner->endpoint_sid_n, mcp_handle);
}
#endif

void
mx__post_send(struct mx_endpoint *ep, union mx_request *q,
	      mcp_ureq_t *ureq, uint16_t mcp_handle)
{
  uint32_t length, offset;
  char *p;
  char *shadow;
  struct mx__partner *partner;
  uint32_t i, chunk;
  uint16_t send_seq;

  q->basic.mcp_handle = mcp_handle;
  partner = q->basic.partner;
  if (unlikely(mx__opt.fw_ack == 0 && partner->liback_pending)) {
    mx__spliceout_request(&ep->resend_reqq, partner->liback_pending);
    ep->liback_count -= 1;
    mx__rl_free(ep, partner->liback_pending);
    partner->liback_pending = NULL;
    mx_assert(!partner->ack_list.tqe_prev);
  }
  if (likely(partner->ack_list.tqe_prev)) {
    TAILQ_REMOVE(&ep->partners_to_ack, partner, ack_list);
    partner->ack_list.tqe_prev = 0;
  }

  /* TODO: Optimize for non requeued case. */
  if (unlikely(q->basic.requeued)) {
    send_seq = q->basic.send_seq;
  }
  else {
    uint16_t seqno;
    send_seq = partner->send_seq;
    q->basic.send_seq = partner->send_seq;
    q->basic.partner = partner;
    mx_assert(partner->quadrant_count[MX__QUADRANT(send_seq)] < MX__QUADRANT_ONE);
    partner->quadrant_count[MX__QUADRANT(send_seq)] += 1;
    seqno = MX__SEQNO(send_seq);
    partner->send_seq = MX__SESNO(send_seq) | MX__SEQNO(send_seq + 1);
    if (likely(mx__opt.fw_ack == 0)) {
      mx__enqueue_partner_request(&MX__PARTNER_PENDING_SEQNUM_TO_QUEUE(partner, send_seq), q);
    }
  }

  /* TODO: mx__post_ureq_* should only be called when nothing can go wrong. */
  length = q->send.basic.status.msg_length;
#if MX_DEBUG
  q->basic.last_send_time = mx_jiffies(ep);
#endif
  switch (q->basic.type) {
  case MX__REQUEST_TYPE_SEND_TINY:
    if (unlikely(q->basic.requeued)) {
      mx__repost_ureq_tiny(ep->is_ze, ureq, partner, mcp_handle, q->send.shadow);
      mx__enqueue_request(&ep->buffered_sendq, q);
      break;
    }

    /* post the request */
    mx__post_ureq_tiny(ep->is_ze, ureq, partner,
		       q->send.basic.status.match_info,
		       length, send_seq, mcp_handle,
		       q->send.segments, q->send.count, q->send.memory_context,
		       q->send.shadow);
    
    /* Small and medium messages are buffered immediately. */
    q->send.basic.state |= MX__REQUEST_STATE_BUFFERED;
    q->send.basic.status.xfer_length = length;
    q->send.basic.status.msg_length = length;
    mx__enqueue_request(&ep->buffered_sendq, q);
    break;

  case MX__REQUEST_TYPE_SEND_SMALL:
    p = mx__ptr_stack_pop(ep->small_msg_ptrs);
    mx_fixme_assert(p);

    shadow = (char *)q->send.shadow;
    if (!q->basic.requeued) {
      mx_assert(length <= sizeof(q->send.shadow));      
      /* First time, backup the data in the shadow. */
      mx__copy_from_segments(shadow, q->send.segments, q->send.count,
			     q->send.memory_context, 0, length);
    }
    q->send.offset_data = (uint32_t)(p - ep->udataq);

    mx_zmemcpy_128(p, shadow, (length + 31) & ~31, ep->is_ze);
      
    /* post the request */
    mx__post_ureq_small(ep->is_ze, ureq, partner,
			q->send.basic.status.match_info, length, send_seq, mcp_handle,
			q->send.offset_data);
    q->send.basic.state |= MX__REQUEST_STATE_BUFFERED;
    q->send.basic.status.xfer_length = length;
    mx__enqueue_request(&ep->buffered_sendq, q);
    break;

  case MX__REQUEST_TYPE_SEND_MEDIUM:
    if (q->basic.requeued) {
      p = ep->sendq + (q->send.offset_data << MX_MCP_VPAGE_SHIFT);
      i = (length + MX_MCP_VPAGE_SIZE - 1) >> MX_MCP_VPAGE_SHIFT;
      
      /* post the request */
      mx__post_ureq_medium(ep->is_ze, ureq, partner,
			   q->send.basic.status.match_info, length, send_seq, mcp_handle,
			   q->send.offset_data, i,
			   ep->medium_msg_pipelines[length >> 10]);
      
      if (ep->is_ze) {
	while (--i) { /* cross finger */
	  mx__post_ureq_medium_cont_ze(ureq);
	}
      }
      mx__enqueue_request(&ep->buffered_sendq, q);
      break;
    }
    
    /* !q->basic.requeued */
    p = mx__memory_pool_alloc(ep->send_pool, ep->medium_msg_threshold);
    if (p != NULL) {
      uint32_t copy_len;
      mx_assert(((uintptr_t)p & MX_MCP_VPAGE_MASK) == 0);
      q->send.offset_data = (uint32_t) (p - ep->sendq) >> MX_MCP_VPAGE_SHIFT;
      
      copy_len = length <= MX_MCP_VPAGE_SIZE ? length : MX_MCP_VPAGE_SIZE;
      mx__copy_from_segments(p, q->send.segments, q->send.count,
			     q->send.memory_context, 0, copy_len);
      
      /* post the request */
      mx__post_ureq_medium(ep->is_ze, ureq, partner,
			   q->send.basic.status.match_info, length, send_seq,
			   mcp_handle, q->send.offset_data, 1,
			   ep->medium_msg_pipelines[length >> 10]);
      q->send.basic.status.xfer_length = length;
      mx_dcbf(p, copy_len);
#if !defined(MX_KERNEL)
      if (q->send.count == 1)
	mx_dcbf((char*)q->send.segments[0].segment_ptr, copy_len);
#endif
      
      /* notify the MCP after copying each vpage of data into the sendq */
      if (length > MX_MCP_VPAGE_SIZE) {
	uint32_t copy_off = copy_len;
	length -= copy_len;
	p += copy_len;
	i = 2;
	do {
	  chunk = MX_MCP_VPAGE_SIZE;
	  if (length <= MX_MCP_VPAGE_SIZE) {
	    chunk = length;
	  }
	  
	  /* copy more data to the sendq */
	  mx__copy_from_segments(p, q->send.segments, q->send.count,
				 q->send.memory_context, copy_off, chunk);
	  
	  /* update the medium request */
	  if (ep->is_ze) {
	    mx__post_ureq_medium_cont_ze(ureq);
	  } else {
	    ureq->medium.credits = i;
	    i++;
	    MX_STBAR();
	  }
	  mx_dcbf(p, chunk);
#if !defined(MX_KERNEL)
	  if (q->send.count == 1)
	    mx_dcbf((char*)q->send.segments[0].segment_ptr + copy_off, chunk);
#endif
	  mx_dcbf(p, chunk);
	  length -= chunk;
	  p += chunk;
	  copy_off += chunk;
	} while (length > 0);
      }
      
      /* Small and medium messages are buffered immediately. */
      q->send.basic.state |= MX__REQUEST_STATE_BUFFERED;
      mx__enqueue_request(&ep->buffered_sendq, q);
      break;
    }
    q->send.basic.type = MX__REQUEST_TYPE_SEND_LARGE;
    mx_fatal("no request type upgrade should be needed for now");
    
  case MX__REQUEST_TYPE_SEND_LARGE:
    if (!q->basic.requeued) {
      uintptr_t memory_context = q->send.memory_context;
      q->send.contiguous_copy = NULL;
      if (q->send.count == 1) {
	/* only one segment */
	mx_reg_seg_t * reg = &q->send.reg_area.segs;
	offset = (uint32_t)MX_RDMA_PAGE_OFFSET((uintptr_t)q->send.segments[0].segment_ptr);
	reg->vaddr = ((uintptr_t) q->send.segments[0].segment_ptr - offset);
	reg->len = q->send.segments[0].segment_length ?
	  MX_RDMA_PAGE_ALIGN(q->send.segments[0].segment_length + offset) : 0;
	q->send.reg_area.nsegs = 1;
      } else if (MX_VECT_COPY
#if defined(MX_KERNEL) && defined MX_PIN_FULLPAGES
		 && !(memory_context & MX_PIN_FULLPAGES)
#endif
		 ) {
	/* emulate vectorial by copying multiple segments in a contiguous buffer
	 * before sendig */
	void * buffer = mx_malloc(q->send.basic.status.msg_length);
	if (!buffer) {
	  mx_printf_once("Warning: mx__post_send/large:mx_malloc failed\n");
	  mx__send_complete(ep, q, MX_STATUS_NO_RESOURCES);
	  break;
	}
	mx__copy_from_segments(buffer, q->send.segments, q->send.count,
			       q->send.memory_context, 0, q->send.basic.status.msg_length);
	q->send.reg_area.nsegs = 1;
	offset = (uint32_t) MX_RDMA_PAGE_OFFSET((uintptr_t) buffer);
	q->send.reg_area.segs.vaddr = ((uintptr_t) buffer  - offset);
	q->send.reg_area.segs.len = q->send.basic.status.msg_length ?
	  MX_RDMA_PAGE_ALIGN(q->send.basic.status.msg_length + offset) : 0;
	q->send.contiguous_copy = buffer;
#ifdef MX_KERNEL
	memory_context = MX_PIN_KERNEL;
#endif
      } else {
	/* multiple segments */
	mx_reg_seg_t * regs = mx_malloc (q->send.count * sizeof(mx_reg_seg_t));
	if (!regs) {
	  MX_WARN(("mx_post_send:rev_large:malloc failed\n"));
	  mx__putback_request(&ep->send_reqq, q);
	  break;
	}
	q->send.reg_area.segs.vaddr = (uintptr_t) regs;
	for (i = 0; i < q->send.count; ++i) {
	  offset = (uint32_t)MX_RDMA_PAGE_OFFSET((uintptr_t)q->send.segments[i].segment_ptr);
	  regs[i].vaddr = ((uintptr_t) q->send.segments[i].segment_ptr - offset);
	  regs[i].len = q->send.segments[i].segment_length ?
	    MX_RDMA_PAGE_ALIGN(q->send.segments[i].segment_length + offset) : 0;
	}
	q->send.reg_area.nsegs = q->send.count;
      }
      q->send.reg_area.memory_context = memory_context;
      mx_rdmawin_get(ep, &q->send.reg_area, 1);
      q->send.local_rdma_id = q->send.reg_area.rdma_id;
      mx_always_assert (ep->rdma_requests[q->send.local_rdma_id].req == 0);
      ep->rdma_requests[q->send.local_rdma_id].req = q;
    }
    /* adjust offset, even if requeued */
    if (q->send.contiguous_copy)
      offset = (uint16_t) MX_RDMA_PAGE_OFFSET((uintptr_t) q->send.contiguous_copy);
    else
      offset = (uint16_t) MX_RDMA_PAGE_OFFSET((uintptr_t) q->send.segments[0].segment_ptr);
    /* we can move this line before pinning pages to overlap pinning
       and scout sending */
    /* post the request */
#if MX_NO_RNDV
    mx__post_fake_rndv
#else
    mx__post_ureq_rndv
#endif
                      (ep->is_ze, ureq, partner,
		       q->send.basic.status.match_info, length, send_seq, mcp_handle,
		       q->send.reg_area.rdma_id,
		       ep->rdma_requests[q->send.reg_area.rdma_id].seqno,
		       offset);
    mx__enqueue_request(&ep->large_sendq, q);
    break;
#if MX_NO_RNDV
  case MX__REQUEST_TYPE_RECV_LARGE:
    mx__post_fake_large_recv(ep, q, ureq, mcp_handle, send_seq);
    q->recv.notifying = 1; /* to avoid repeating recv allocations upon resend */
    break;
#else
  case MX__REQUEST_TYPE_RECV_LARGE:
    mx_always_assert(q->recv.notifying);
    mx__post_ureq_notify(ep->is_ze, ureq, partner, q,
			 send_seq, mcp_handle);
    break;
#endif
  default:
    mx_fatal("mx_post_send:unknown type");
  }
  if (partner->quadrant_count[MX__QUADRANT(partner->send_seq + 2 * MX__QUADRANT_ONE)]) {
    if (mx__opt.verbose || mx__opt.fw_ack == 0) {
      static int count;
      if (count++ < 10 || mx__opt.verbose >= 3)
	mx_printf("Sequence number are rolling over in mx_lib: throttling\n");
    }
    mx_always_assert(mx__opt.fw_ack == 0);
  }
}

/*
 * Fill a library-level ack message for the partner attached to request `r`
 * and post it through mx__post_ureq_truc().
 *
 * ep         endpoint the ack is sent from
 * r          liback request; must be the partner's pending liback
 * ureq       MCP request slot to post into
 * mcp_handle handle associated with that slot
 *
 * Side effects on the partner: send_acknum is bumped, recv_acked catches up
 * with fully_recv_seq and liback_pending is cleared.
 */
static void
mx__post_liback(mx_endpoint_t ep, union mx_request *r,
	        mcp_ureq_t *ureq, uint16_t mcp_handle)
{
  ALIGNED_MCP_UREQ(batch);
  union mx__lib2lib *ack_msg;
  struct mx__partner *peer;

  MX__EP_STATS_INC(ep, total_acks);
  /* library acks are only posted when fw_ack is off */
  mx_assert(mx__opt.fw_ack == 0);

  peer = r->basic.partner;
  ack_msg = MX__LIB2LIB(&batch->truc);

  /* this request must be the ack the partner is waiting to send,
     and the partner must not be linked on an ack list */
  mx_assert(peer->liback_pending == r);
  mx_assert(peer->ack_list.tqe_prev == 0);

  /* each ack carries a fresh ack number */
  peer->send_acknum++;

  ack_msg->ack.type = MX__LIB2LIB_ACK;
  ack_msg->ack.acknum = htonl(peer->send_acknum);
  ack_msg->ack.session_id = peer->best_session_n;
  ack_msg->ack.lib_seqnum = htons(peer->fully_recv_seq);
#if MX_DEBUG
  ack_msg->ack.send_seq = htons(peer->send_seq);
  ack_msg->ack.requeued = peer->requeued;
#endif

  /* everything fully received so far is acknowledged by this message */
  peer->recv_acked = peer->fully_recv_seq;
  peer->liback_pending = NULL;

  mx__post_ureq_truc(ep->is_ze, ureq, batch, r->liback.peer_n, r->liback.eid,
		     peer->best_session_n, mcp_handle);
#if MX_DEBUG
  r->basic.last_send_time = mx_jiffies(ep);
#endif
}

#if MX_NO_RNDV
/*
 * Build a rendez-vous announcement as a lib2lib message and post it with
 * mx__post_ureq_truc(). Only compiled when MX_NO_RNDV is set.
 *
 * The event header carries the library sequence number, a piggy-backed ack
 * (partner->fully_recv_seq, which is also recorded in partner->recv_acked)
 * and the 64-bit match info split into two 32-bit network-order halves.
 * The payload carries the message length (network order), the rdma window
 * id and the rdma window sequence number.
 *
 * NOTE(review): the `offset` argument is never read; bytes 6-7 of the
 * payload are always written as 0xffff -- confirm this is intended.
 */
static void
mx__post_fake_rndv(int ze, mcp_ureq_t *req,
		   struct mx__partner *partner, uint64_t match_info,
		   uint32_t msg_length, uint16_t seqnum, uint16_t cookie, 
		   uint8_t rdma_id, uint8_t rdma_seqnum, uint16_t offset)
{
  ALIGNED_MCP_UREQ(batch);
  union mx__lib2lib *lib2lib;

  lib2lib = MX__LIB2LIB(&batch->truc);
  lib2lib->type = MX__LIB2LIB_RNDV;
  /* source identification is not filled in here */
  lib2lib->rndv.evt.src_endpt = -1;
  lib2lib->rndv.evt.src_peer_index = -1;
  lib2lib->rndv.evt.lib_seqnum = htons(seqnum);
  /* piggy-back an ack for everything fully received so far */
  partner->recv_acked = partner->fully_recv_seq;
  lib2lib->rndv.evt.lib_piggyack = htons(partner->fully_recv_seq);
  lib2lib->rndv.evt.match_a = htonl(match_info >> 32);
  lib2lib->rndv.evt.match_b = htonl((uint32_t)match_info);
  /* payload layout: [0..3] length, [4] rdma id, [5] rdma seqnum, [6..7] 0xffff */
  *(uint32_t *) &(lib2lib->rndv.data[0]) = htonl(msg_length);
  *(uint8_t *) &(lib2lib->rndv.data[4]) = rdma_id;
  *(uint8_t *) &(lib2lib->rndv.data[5]) = rdma_seqnum;
  *(uint16_t *) &(lib2lib->rndv.data[6]) = -1;
  mx__post_ureq_truc(ze, req, batch, partner->peer_index_n, partner->eid, 
		     partner->connect_session_n, cookie);
}
#endif

/*
 * Take up to `req_count` requests from the queue `head` and hand each of
 * them to the MCP.
 *
 * ep         endpoint owning the queue
 * head       request queue to drain (send queue or resend queue)
 * req_count  maximum number of requests handled by this call
 * resend     non-zero when `head` holds requests being resent; the
 *            throttling checks below are only applied when it is zero
 *
 * For every request an MCP handle is allocated and recorded in the handle
 * map, the request state moves from SEND_QUEUED to MCP, and the request is
 * posted according to its type.  Processing stops early when the queue is
 * empty, when no MCP handle is available, or (non-resend only) when one of
 * the throttling conditions holds.
 */
static void
mx__process_requests(mx_endpoint_t ep, struct mx__request_queue_head * head,
		     unsigned int req_count, int resend)
{
  unsigned int i;
  union mx_request *req;
  mcp_ureq_t *ureq;
  uint16_t mcp_handle, offset;
  uint32_t remaining_length, length;
  uintptr_t memory_context;

  /* TODO: Magic number. */
  /* once event_count reaches 32, write eventq_flow to *ep->flow and
     restart the count */
  if (ep->event_count >= 32) {
    *ep->flow = mx_htonll((uint64_t)ep->eventq_flow << 32);
    ep->event_count = 0;
  }

  for (i = 0; i < req_count; ++i) {
    if (mx__isempty_request_queue(head))
      break;

    /* Do not send large messages if not resending (needs a rdma window)
     * while no more rdma window are available.
     * We could also check whether the request is a large send but we do
     * not want to give too much privileges to other request types. */
    if (!resend && ep->rdmas.max == ep->rdmas.count)
      break;

    if (!mx__endpoint_avail_mcp_handle(ep))
      break;

    req = mx__dequeue_request(head);
    mx_assert(req->basic.state & MX__REQUEST_STATE_SEND_QUEUED);
    mx_assert(!(req->basic.state & MX__REQUEST_STATE_MCP));

    if (!resend &&
	/* since connect and reply are queued in resend_rreq, they cannot come here */
	(
	 /* do not use more than half rdma window for large sends */
	 (req->basic.type == MX__REQUEST_TYPE_SEND_LARGE
	  && ep->rdmas.send_count >= ep->rdmas.max / 2)
	 /* do not send too much to a partner */
	 || req->basic.partner->quadrant_count[MX__QUADRANT(req->basic.partner->send_seq + 2 * MX__QUADRANT_ONE)]
	 /* do not send medium if there is no place to buffer them */
	 || ep->send_pool->block.count == ep->send_pool->block.max
	)) {
      /* throttled: keep the request at the front of the queue for later */
      mx__putback_request(head, req);
      break;
    }

    mcp_handle = mx__endpoint_alloc_mcp_handle(ep);
    ureq = MX__UREQ(ep, mcp_handle);
    mx__hm_set(ep->handle_map, mcp_handle, req);

    req->basic.state &= ~MX__REQUEST_STATE_SEND_QUEUED;
    req->basic.state |= MX__REQUEST_STATE_MCP;

    switch (req->basic.type) {
    case MX__REQUEST_TYPE_SEND_TINY:
    case MX__REQUEST_TYPE_SEND_SMALL:
    case MX__REQUEST_TYPE_SEND_MEDIUM:
    case MX__REQUEST_TYPE_SEND_LARGE:
      mx__post_send(ep, req, ureq, mcp_handle);
      break;

    case MX__REQUEST_TYPE_RECV_LARGE:
      req->basic.mcp_handle = mcp_handle;
      if (req->recv.notifying) {
	/* the receive is in its notify phase: no window setup needed */
	mx__post_send(ep, req, ureq, mcp_handle);
	mx__enqueue_request(&ep->large_getq, req);
	continue;
      }
      remaining_length = req->basic.status.xfer_length;
      req->recv.contiguous_copy = NULL;
      memory_context = req->recv.memory_context;
      if (req->recv.count == 1
	  || remaining_length <= req->recv.segments[0].segment_length) {
	/* only one segment */
	mx_reg_seg_t * reg = &req->recv.reg_area.segs;
	offset = (uint32_t)MX_RDMA_PAGE_OFFSET((uintptr_t)req->recv.segments[0].segment_ptr);
	reg->vaddr = ((uintptr_t) req->recv.segments[0].segment_ptr - offset);
	length = MX_MIN(remaining_length, req->recv.segments[0].segment_length);
	reg->len = length ? MX_RDMA_PAGE_ALIGN(length + offset) : 0;
	req->recv.reg_area.nsegs = 1;
      } else if (MX_VECT_COPY
#if defined(MX_KERNEL) && defined MX_PIN_FULLPAGES
		 && !(memory_context & MX_PIN_FULLPAGES)
#endif
		 ) {
	/* emulate vectorial by receiving in a contiguous buffer
	 * and then copying into multiple segments */
	void * buffer = mx_malloc(req->basic.status.xfer_length);
	if (!buffer) {
	  /* NOTE(review): the MCP handle allocated above is not released on
	   * this path -- confirm whether mx__recv_complete takes care of it */
	  mx_printf_once("Warning: mx__process_requests/recv_large:mx_malloc failed\n");
	  mx__recv_complete(ep, req, MX_STATUS_NO_RESOURCES);
	  break;
	}
	req->recv.reg_area.nsegs = 1;
	offset = (uint16_t)MX_RDMA_PAGE_OFFSET((uintptr_t) buffer);
	req->recv.reg_area.segs.vaddr = ((uintptr_t) buffer  - offset);
	req->recv.reg_area.segs.len = req->basic.status.xfer_length ?
	  MX_RDMA_PAGE_ALIGN(req->basic.status.xfer_length + offset) : 0;
	req->recv.contiguous_copy = buffer;
#ifdef MX_KERNEL
	memory_context = MX_PIN_KERNEL;
#endif
      } else {
	/* multiple segments */
	mx_reg_seg_t * regs = mx_malloc (req->recv.count * sizeof(mx_reg_seg_t));
	/* dedicated index: the outer counter `i` tracks how many requests
	 * were processed and must not be clobbered by the segment walk */
	unsigned int seg;
	if (!regs) {
	  MX_WARN(("mx_post_send:rev_large:malloc failed\n"));
	  mx__putback_request(head, req);
	  break;
	}
	req->recv.reg_area.segs.vaddr = (uintptr_t) regs;
	req->recv.reg_area.nsegs = req->recv.count;
	for (seg = 0; seg < req->recv.count; ++seg) {
	  offset = (uint32_t)MX_RDMA_PAGE_OFFSET((uintptr_t)req->recv.segments[seg].segment_ptr);
	  regs[seg].vaddr = ((uintptr_t) req->recv.segments[seg].segment_ptr - offset);
	  length = MX_MIN(remaining_length, req->recv.segments[seg].segment_length);
	  regs[seg].len = length ? MX_RDMA_PAGE_ALIGN(length + offset) : 0;
	  /* this segment absorbs `length` bytes of what is left to receive */
	  remaining_length -= length;
	  /* do this check at the end to be sure there's a least one segment in the reg_area */
	  if (!remaining_length) {
	    /* fewer segments than we thought */
	    req->recv.reg_area.nsegs = seg+1;
	    break;
	  }
	}
	/* adjust offset */
	offset = (uint32_t)MX_RDMA_PAGE_OFFSET((uintptr_t)req->recv.segments[0].segment_ptr);
      }
      req->recv.reg_area.memory_context = memory_context;
      mx_rdmawin_get(ep, &req->recv.reg_area, 0);
      req->recv.local_rdma_id = req->recv.reg_area.rdma_id;
      /* the window slot must be free before this request claims it */
      mx_assert (ep->rdma_requests[req->recv.reg_area.rdma_id].req == 0);
      ep->rdma_requests[req->recv.reg_area.rdma_id].req = req;
      if (MX_NO_RNDV)
	mx__post_send(ep, req, ureq, mcp_handle);
      else
	mx__post_large_recv(ep, req, ureq, mcp_handle,
			    req->recv.reg_area.rdma_id, 0,
			    offset);
      mx__enqueue_request(&ep->large_getq, req);
      break;
    case MX__REQUEST_TYPE_CONNECT:
      req->basic.mcp_handle = mcp_handle;
      mx__post_ureq_connect(ep->is_ze, ureq, req, mcp_handle, 0);
      mx__enqueue_request(&ep->mcp_connectq, req);
      break;
    case MX__REQUEST_TYPE_CONNECT_REPLY:
      req->basic.mcp_handle = mcp_handle;
      mx__post_ureq_connect(ep->is_ze, ureq, req, mcp_handle, 1);
      mx__enqueue_request(&ep->mcp_connect_replyq, req);
      break;
    case MX__REQUEST_TYPE_LIBACK:
      req->basic.mcp_handle = mcp_handle;
      mx__post_liback(ep, req, ureq, mcp_handle);
      mx__enqueue_request(&ep->ackq, req);
      break;
    default:
      mx_printf("process_requests:Unknown type %d\n", req->basic.type);
      mx_always_assert(0);
    }
  }
}

void mx_wait_for_recovery(struct mx_endpoint *ep);

/*
 * One pass of library progression for endpoint `ep`:
 *   1. process up to 32 events,
 *   2. post up to 8 requests from the send queue,
 *   3. run shared-memory progression when compiled in and not disabled,
 *   4. unless mx__opt.fw_ack is set: process partners to ack, process the
 *      resend list, post up to 8 requests from the resend queue and, in
 *      monothread mode, periodically poll the board status.
 * Returns immediately when ep->in_handler is set.
 */
void
mx__luigi(mx_endpoint_t ep)
{
  /* nothing is done while ep->in_handler is set
     (presumably to avoid re-entering progression from a handler) */
  if (ep->in_handler) 
    return;
#if MX_DEBUG
  if (mx__opt.matter_debug >= 2)
    mx__conservation_of_matter(ep);
#endif
  mx__process_events(ep, 32); /* TODO: Magic number. */
  mx__process_requests(ep, &ep->send_reqq, 8, 0); /* TODO: Magic number. */
#if MX_USE_SHMEM
  if (!mx__opt.disable_shmem)
    mx__shmem_luigi(ep);
#endif
#if MX_DEBUG
  if (mx__opt.matter_debug >= 1)
    mx__conservation_of_matter(ep);
#endif
  /* everything below is skipped when mx__opt.fw_ack is set */
  if (mx__opt.fw_ack)
    return;
  mx__process_partners_to_ack(ep);
  mx__process_resend_list(ep);
  mx__process_requests(ep, &ep->resend_reqq, 8, 1); /* TODO: Magic number. */
  if (mx__opt.monothread) {
    /* small hack to regularly check driver status */
    /* both counters are static: they are shared by every endpoint and
       every caller of this function */
    static uint32_t count;
    static mx_jiffies_t last;
    
    /* only look at the clock every 16 (MX_OS_UDRV) or 1024 calls, and only
       query the board when more than resend_delay jiffies have elapsed */
    if (count++ % (MX_OS_UDRV ? 16 : 1024) == 0) {
      if (mx_jiffies(ep) - last > ep->resend_delay) {
	if (!mx__opt.no_myrinet) {
	  uint32_t status = 0;
	  if (mx__get_board_status(ep->handle, &status) != 0) {
	    mx_printf("get_board_status failed\n");
	    mx_fatal("Problem with driver/firmware, check kernel logs for error messages");
	  } else if (status != 0) {
	    /* an SRAM parity error is the only status that is recovered from */
	    if (status == MX_DEAD_SRAM_PARITY_ERROR) {
	      mx_wait_for_recovery(ep);
	    } else {
	      mx_printf("get_board_status reports 0x%x\n", status);
	      mx_fatal("Problem with driver/firmware, check kernel logs for error messages");
	    }
	  }
	}
	last = mx_jiffies(ep);
      }
    }
  }
#if 0
  /* enable that code to detect that the mcp is loosing some requests */
#define MX__FOREACH_REQ(r,list) for (r=(list)->basic.next;r!=(list);r=r->basic.next)
  mx_jiffies_t stamp = mx_jiffies(ep);
  union mx_request *r;
  mx__process_events(ep, 8); /* TODO: Magic number. */
  MX__FOREACH_REQ(r,&ep->ackq) {
    mx_assert((int64_t)(stamp - r->basic.last_send_time) < 3 * ep->resend_delay);
  }
  MX__FOREACH_REQ(r,&ep->buffered_sendq) {
    mx_assert((int64_t)(stamp - r->basic.last_send_time) < 3 * ep->resend_delay);
  }
#endif
}

#if MX_DRIVER_API_MAGIC >= 0x500

/*
 * Reconnect to all partners, especially to reinitialize src_peer_index in
 * the MCP.
 *
 * Walks every (peer, endpoint) slot of ep->remote_ep; for each known
 * partner a connect request is obtained from mx__reconnect_partner() and,
 * when one is returned, posted to the MCP and queued on mcp_connectq.
 * Always returns 0.
 */
static int
mx__reconnect_partners(struct mx_endpoint *ep)
{
  uint32_t peer, endpt;

  for (peer = 0; peer < ep->max_peers; peer++) {
    for (endpt = 0; endpt < ep->max_endpoints; endpt++) {
      int slot = peer*ep->max_endpoints + endpt;
      struct mx__partner * partner = ep->remote_ep[slot];
      union mx_request * connect_req;
      mcp_ureq_t * ureq;
      uint16_t handle;
      mx_return_t rc;

      /* FIXME: do not connect if it were connected to us and not us to it
       * (partner->app_key_n would be wrong) */
      if (!partner)
	continue;

      rc = mx__reconnect_partner(ep, partner, &connect_req);
      mx_fixme_assert(rc == 0);
      if (!connect_req)
	continue;

      /* process events one at a time until an MCP handle is available */
      while (!mx__endpoint_avail_mcp_handle(ep))
	mx__process_events(ep, 1);

      handle = mx__endpoint_alloc_mcp_handle(ep);
      ureq = MX__UREQ(ep, handle);
      mx__hm_set(ep->handle_map, handle, connect_req);

      /* post the request and forget about its completion */
      connect_req->basic.state = MX__REQUEST_STATE_MCP | MX__REQUEST_STATE_DEAD;
      connect_req->basic.mcp_handle = handle;
      mx__post_ureq_connect(ep->is_ze, ureq, connect_req, handle, 0);
      mx__enqueue_request(&ep->mcp_connectq, connect_req);
    }
  }

  return 0;
}

/*
 * Bring endpoint `ep` back to a usable state after the driver has
 * recovered from a parity error.
 *
 * Steps, in order: recover the endpoint in the driver, drain pending
 * events, re-initialise the event queue, receive queue and MCP request
 * ring, move the requests that were posted to the MCP back onto the resend
 * list (releasing their MCP handles), drop pending library acks, and
 * finally reconnect to every partner.
 *
 * Requests whose registered area is larger than 2MB cannot be recovered:
 * the function calls mx_fatal() when it meets one.
 */
static void
mx__parity_error_recover(struct mx_endpoint *ep)
{
  union mx_request *req;
  struct mx__request_queue_head *elt;
  int count;

  /* Recover the endpoint in the driver so that rdma window are re-setup'ed */
  if (mx__opt.verbose)
    mx_printf("Restarting the endpoint...\n");
  mx__recover_endpoint(ep->handle);

  /* Process all pending events in the event queue */
  if (mx__opt.verbose)
    mx_printf("Processing pending events...\n");
  mx__process_events(ep, -1);

  /* Reset some stuff. */
  if (mx__opt.verbose)
    mx_printf("Resetting MCP descriptors...\n");
  mx__endpoint_init_eventq(ep);
  mx__endpoint_init_recvq(ep);
  mx__mcp_request_ring_init(ep->req_ring,
			    ep->sram+ep->desc.user_reqq_offset,
			    ep->desc.user_reqq_len);

  /* Queue up pending requests. Kind of like what happens when you get
     an MX_MCP_UEVT_ERROR. */
  /* TODO: Put request back in order? */

  if (mx__opt.verbose)
    mx_printf("Re-queueing requests...\n");

  /* Put send/connect requests back in the resend list */
  count = 0;
  while (!mx__isempty_request_queue(&ep->buffered_sendq)) {
    req = mx__dequeue_request(&ep->buffered_sendq);
    mx_assert(req->basic.state & MX__REQUEST_STATE_MCP);
    req->basic.state &= ~MX__REQUEST_STATE_MCP;
    /* do not set MX__REQUEST_STATE_SEND_QUEUED when queueing in the resend_list */
    req->basic.requeued = 1;
    mx__putback_request(&ep->resend_list, req);
    mx__endpoint_free_mcp_handle(ep, req->basic.mcp_handle);
    count++;
  }
  if (mx__opt.verbose)
    mx_printf("Re-queued %d buffered send/connect requests\n", count);

  /* Put connect replies back in the resend list */
  /* NOTE(review): this loop drains mcp_connectq although the comment and
   * the message below talk about connect replies; mcp_connect_replyq is not
   * drained anywhere in this function -- confirm which queue is intended */
  count = 0;
  while (!mx__isempty_request_queue(&ep->mcp_connectq)) {
    req = mx__dequeue_request(&ep->mcp_connectq);
    mx_assert(req->basic.state & MX__REQUEST_STATE_MCP);
    req->basic.state &= ~MX__REQUEST_STATE_MCP;
    /* do not set MX__REQUEST_STATE_SEND_QUEUED when queueing in the resend_list */
    req->basic.requeued = 1;
    mx__putback_request(&ep->resend_list, req);
    mx__endpoint_free_mcp_handle(ep, req->basic.mcp_handle);
    count++;
  }
  if (mx__opt.verbose)
    mx_printf("Re-queued %d connect reply requests\n", count);

  /* Check whether there's some very large rndv messages have been sent */
  /* these two walks only inspect the requests, nothing is moved */
  count = 0;
  MX__FOREACH_REQ(req, elt, &ep->resend_list) {
    /* FIXME: drop when more than 2MB messages are supported */
    if (req->basic.type != MX__REQUEST_TYPE_SEND_LARGE)
      continue;
    if (req->send.reg_area.segs.len > 2*1024*1024) {
      /* FIXME: use MX_ADDRS_PER_VPAGE*MX_VPAGE_SIZE from the driver */
      mx_fatal("cannot recover very-large messages (>2MB) from memory parity error 1");
    }
    /* nothing to do, the window has been rearmed by the driver */    
    count++;
  }
  MX__FOREACH_REQ(req, elt, &ep->notifying_large_sendq) {
    /* FIXME: drop when more than 2MB messages are supported */
    if (req->send.reg_area.segs.len > 2*1024*1024) {
      /* FIXME: use MX_ADDRS_PER_VPAGE*MX_VPAGE_SIZE from the driver */
      mx_fatal("cannot recover very-large messages (>2MB) from memory parity error 1");
    }
    /* nothing to do, the window has been rearmed by the driver */    
    count++;
  }
  if (mx__opt.verbose)
    mx_printf("Checked %d already sent rndv message requests\n", count);

  /* Put rndv messages back in the resend list */
  count = 0;
  while (!mx__isempty_request_queue(&ep->large_sendq)) {
    req = mx__dequeue_request(&ep->large_sendq);

    /* FIXME: support more than 2MB */
    if (req->send.reg_area.segs.len > 2*1024*1024) {
      /* FIXME: use MX_ADDRS_PER_VPAGE*MX_VPAGE_SIZE from the driver */
      /* There's a very tiny chance that the parity occured between the rndv got sent
       * by the NIC and the DONE event came to the lib, so we need to abort. */
      mx_fatal("cannot recover very-large messages (>2MB) from memory parity error 2");
    }

    mx_assert(req->basic.state & MX__REQUEST_STATE_MCP);
    /* the window has been rearmed by the driver */
    req->send.basic.state &= ~MX__REQUEST_STATE_MCP;
    /* do not set MX__REQUEST_STATE_SEND_QUEUED when queueing in the resend_list */
    req->basic.requeued = 1; /* so that the window is not registered again */
    mx__putback_request(&ep->resend_list, req);
    mx__endpoint_free_mcp_handle(ep, req->basic.mcp_handle);
    count++;
  }
  if (mx__opt.verbose)
    mx_printf("Re-queued %d rndv message requests\n", count);

  /* Stop large gets, unregister, and requeue them */
  count = 0;
  while (!mx__isempty_request_queue(&ep->large_getq)) {
    req = mx__dequeue_request(&ep->large_getq);

    /* FIXME: support more than 2MB */
    if (req->recv.reg_area.segs.len > 2*1024*1024) {
      /* FIXME: use MX_ADDRS_PER_VPAGE*MX_VPAGE_SIZE from the driver */
      mx_fatal("cannot recover very-large messages (>2MB) from memory parity error 3");
    }

    mx_assert(req->basic.state & MX__REQUEST_STATE_MCP);
    req->recv.basic.state &= ~MX__REQUEST_STATE_MCP;
    /* do not set MX__REQUEST_STATE_SEND_QUEUED when queueing in the resend_list */
    req->basic.requeued = 1;
    /* if notifying, the rdma window won't be open again, no need to close it */
    if (!req->recv.notifying) {
      /* release the rdma window for now, it will be reopen again when the request is processed */
      mx_rdmawin_release(ep, req->recv.local_rdma_id, 0, &req->recv.reg_area);
      /* free whichever helper allocation was attached to the window: the
       * contiguous bounce buffer, or the segment array (nsegs > 1) */
      if (req->recv.contiguous_copy)
	mx_free(req->recv.contiguous_copy); /* FIXME: keep it, so that pining is still right? */
      else if (req->recv.reg_area.nsegs > 1)
	mx_free((void*)(uintptr_t)req->recv.reg_area.segs.vaddr);
      ep->rdma_requests[req->recv.local_rdma_id].req = 0;
    }
    mx__putback_request(&ep->resend_list, req);
    mx__endpoint_free_mcp_handle(ep, req->basic.mcp_handle);
    count++;
  }
  if (mx__opt.verbose)
    mx_printf("Re-queued %d large receive requests\n", count);

  /* Drop libacks */
  /* acks are freed rather than requeued */
  count = 0;
  while (!mx__isempty_request_queue(&ep->ackq)) {
    req = mx__dequeue_request(&ep->ackq);
    mx_assert(req->basic.state & MX__REQUEST_STATE_MCP);
    ep->liback_count -= 1;
    mx__endpoint_free_mcp_handle(ep, req->basic.mcp_handle);
    mx__rl_free(ep, req);
    count++;
  }
  if (mx__opt.verbose)
    mx_printf("Dropped %d lib acks\n", count);

  /* All the mcp handles should be accounted for by now. */
  /* Don't bother to process requests, let it happen naturally. */

  /* Reconnect to partners, at least to ensure that the MCP src_peer_index will be re-setup'd */
  if (mx__opt.verbose)
    mx_printf("Reconnecting to peers...\n");
  mx__reconnect_partners(ep);
}
#endif

/*
 * Block until the driver reports the outcome of a parity error, then act
 * on it: run the library-side recovery when the error was corrected, abort
 * when it is uncorrectable.  Any other reported status is ignored.
 * Without driver API >= 0x500 recovery is not possible and the call aborts.
 */
void
mx_wait_for_recovery(struct mx_endpoint *ep)
{
#if MX_DRIVER_API_MAGIC >= 0x500
  uint32_t outcome;

  MX_WARN(("Parity error detected, waiting for the driver to recover...\n"));
  mx__wait_for_recovery(ep->handle, &outcome);

  if (outcome == MX_WAIT_PARITY_ERROR_CORRECTED) {
    MX_WARN(("Driver successfully recovered parity error\n"));
    mx__parity_error_recover(ep);
  } else if (outcome == MX_WAIT_PARITY_ERROR_UNCORRECTABLE) {
    mx_fatal("Uncorrectable parity error received.");
  }
#else
  mx_fatal("Cannot recover parity error");
#endif
}

int
mx_check_wait_status(struct mx_endpoint *ep, uint32_t * status)
{
  switch(*status) {
  case MX_WAIT_STATUS_GOOD:
  case MX_WAIT_TIMEOUT_OR_INTR:
    /* do not touch status since it is still required in the caller */
    break;

  case MX_WAIT_PARITY_ERROR_DETECTED:
    mx_wait_for_recovery(ep);
    /* FIXME: clear the status so that the caller does not exit mx_wait ? */
    break;

  case MX_WAIT_ENDPT_ERROR:
    mx_fatal("MCP detected endpoint error.");
    break;

  default: {
    char s[64];
    mx_sprintf(s, "Unknown error %d received", *status);
    mx_fatal(s);
  }
  }
  return 0;
}

/*
 * Body of the endpoint progression thread.
 *
 * p is the endpoint (struct mx_endpoint *).  The thread loops until
 * ep->cancelled is set; each iteration runs one progression pass under the
 * endpoint lock, wakes up waiters, then blocks in mx__wait() (with a
 * 1000 timeout) without holding the lock, and finally checks the status
 * reported by the wait.
 */
MX__THREAD_RETURN_T
mx__progress_thread(void *p)
{
  mx_wait_t x;
  struct mx_endpoint *ep;

  ep = p;
  MX__THREAD_INIT(&ep->thread);

  /* wake-up every second, to check for connect requests
     allow to complete a mx_connect independently
     of receiver activity
  */
  x.timeout = 1000; /* XXXX */
  x.status = 0;
  x.mcp_wake_events = 0;
  do {
    mx_return_t ret;
    MX__MUTEX_LOCK(&ep->lock);
    /* do the wake accounting from the previous iteration: assume the
       only one to generate wake_events bringing us here is the MCP
       (true because other kind of wake are associated with terminal
       conditions that won't cause the loop to be reentered) */
    mx_assert(ep->wake_pending >= x.mcp_wake_events);
    if (x.mcp_wake_events) {
      /* one pending wake is retired and its MCP handle is released */
      ep->wake_pending -= 1;
      mx__endpoint_free_mcp_handle(ep,ep->wake_mcp_handle);
    }
    
    /* in_progression_thread is only set for the duration of the pass */
    ep->in_progression_thread = 1;
    ep->timer = 0;
    mx__luigi(ep);
    ep->in_progression_thread = 0;
    if (ep->app_waiting) {
      mx__app_wake(ep->handle);
    }
    /* with waiters registered and no application wait in progress,
       post a wake request */
    if (ep->wait_waiters > 0 || ep->peek_waiters > 0 || ep->probe_waiters > 0) {
      if (!ep->app_waiting)
	mx__uwake_req(ep);
    }
    MX__MUTEX_UNLOCK(&ep->lock);
    /* the wait itself runs without the endpoint lock */
    ret = mx__wait(ep->handle, &x);
    if (ret != 0) {
      mx_fatal("mx__wait failed, check kernel logs for error messages");
    }
    MX__MUTEX_LOCK(&ep->lock);
#if MX_USE_SHMEM
    if (!mx__opt.disable_shmem)
      ep->shm->shmq->waiting = 0;
#endif
    mx_check_wait_status(ep, &x.status);
    MX__MUTEX_UNLOCK(&ep->lock);
  } while (!ep->cancelled);

  MX__THREAD_EXIT_WITH_NOTHING(&ep->thread);
}

#if 0
/*
 * Debug helper (compiled out by the surrounding #if 0): decode the fields
 * of a PIO or copy send request from network byte order into locals.
 * The decoded values are not printed or otherwise used; any other request
 * type trips an assertion.
 */
static void
mx__print_req(mcp_ureq_t *p)
{
  uint16_t dest_endpt;
  uint32_t dest_session;
  uint32_t offset_data;
  uint32_t dest_lookup_hint;
  uint32_t match_cst;
  uint16_t match_var0;
  uint16_t match_var1;
  uint32_t length;
  uint32_t id;
  uint32_t type;
  
  type = ntohl(p->basic.type);
  switch(type) {
  case MX_MCP_UREQ_SEND_PIO:
  case MX_MCP_UREQ_SEND_COPY:
    /* 16-bit fields use ntohs, 32-bit fields use ntohl */
    dest_endpt = ntohs(p->send.req.dest_endpt);
    dest_session = ntohl(p->send.req.dest_session);
    offset_data = ntohl(p->send.req.offset_data);
    dest_lookup_hint = ntohl(p->send.req.dest_lookup_hint);
    match_cst = ntohl(p->send.req.match_cst);
    match_var0 = ntohs(p->send.req.match_var0);
    match_var1 = ntohs(p->send.req.match_var1);
    length = ntohl(p->send.req.length);
    id = ntohl(p->send.req.id);
    break;
  default:
    mx_assert(0);
    break;
  }
}
#endif

/*
 * Split the low 48 bits of `nic_id` into six bytes, most significant byte
 * first: macaddr[0] receives bits 47..40 and macaddr[5] bits 7..0.
 * Bits above 47 are ignored.
 */
void
mx__nic_id_to_macaddr(uint64_t nic_id, uint8_t macaddr[6])
{
  int pos;

  for (pos = 0; pos < 6; pos++) {
    /* byte `pos` sits (5 - pos) bytes above the least significant one */
    macaddr[pos] = (uint8_t)(nic_id >> (8 * (5 - pos)));
  }
}

/*
 * Pack a six-byte MAC address into a 64-bit NIC id, macaddr[0] ending up
 * in bits 47..40 and macaddr[5] in bits 7..0.  The result is stored
 * through `nic_idp`; the upper 16 bits are zero.
 */
void
mx__macaddr_to_nic_id(uint8_t macaddr[6], uint64_t *nic_idp)
{
  uint64_t packed = 0;
  int pos = 0;

  while (pos < 6) {
    /* make room for the next byte, then drop it into the low 8 bits */
    packed = (packed << 8) | macaddr[pos];
    pos++;
  }
  *nic_idp = packed;
}

/*
 * Format the low 48 bits of `nic_id` as "xx:xx:xx:xx:xx:xx" (lowercase
 * hex, most significant byte first) into `str`.
 *
 * str     destination buffer
 * nic_id  NIC id to format
 * strlen  size of `str` in bytes
 *
 * Returns `str`.  The buffer is always NUL-terminated when strlen is
 * non-zero; with strlen == 0 nothing is written at all (the previous code
 * computed str[strlen - 1] with an unsigned strlen, which wrote far outside
 * the buffer).
 */
MX_FUNC(const char *) mx__nic_id_to_str(char *str, uint64_t nic_id, uint32_t strlen)
{
  /* no room for even the terminator: leave the buffer untouched */
  if (strlen == 0)
    return str;

  mx_snprintf(str, strlen, "%02x:%02x:%02x:%02x:%02x:%02x", 
	      (uint8_t)(nic_id >> 40),
	      (uint8_t)(nic_id >> 32),
	      (uint8_t)(nic_id >> 24),
	      (uint8_t)(nic_id >> 16),
	      (uint8_t)(nic_id >> 8),
	      (uint8_t)(nic_id >> 0));
  /* force termination regardless of what mx_snprintf guarantees */
  str[strlen - 1] = 0;
  return str;
}


/*
 * Keep running progression on `ep` until nothing is left in flight:
 * the buffered send queue, the send queue and the resend queue are empty,
 * no partner is waiting for an ack, and every MCP handle is either free or
 * accounted for by a pending wake.  Between passes the caller spins
 * (kernel) or yields the processor (user space).
 */
void
mx__finish_buffered_requests(mx_endpoint_t ep)
{
  mx_jiffies_t delay, start;

  for (;;) {
    /* done once every queue is drained and all handles are back */
    if (mx__isempty_request_queue(&ep->buffered_sendq) &&
	(ep->handle_map->free_count + ep->wake_pending
	 == ep->handle_map->total_count) &&
	mx__isempty_request_queue(&ep->send_reqq) &&
	mx__isempty_request_queue(&ep->resend_reqq) &&
	TAILQ_EMPTY(&ep->partners_to_ack))
      break;

    mx__luigi(ep);
#ifdef MX_KERNEL
    mx_spin(20000);
#else
    mx_sched_yield();
#endif
  }

  /* only consumed by the disabled grace period below */
  delay = mx_jiffies_hz(ep);
  start = mx_jiffies(ep);
#if 0
  while (mx_jiffies(ep) - start < delay) {
    mx__luigi(ep);
    mx_sched_yield();
  }
#endif
}


#define RDMAWIN_ENTRIES 8

/*
 * Set up the rdma window cache of endpoint `ep`.
 *
 * In user space with runtime options, the cache is first switched off when
 * it was requested (rcache == 1) but mx__regcache_works() reports it cannot
 * work, and the resulting configuration is logged in verbose mode.
 * When the cache stays enabled, both window lists are initialised and
 * RDMAWIN_ENTRIES zeroed entries are allocated and put on the free list.
 */
void mx_rdmawin_init(mx_endpoint_t ep)
{
  int slot = 0;

#if !defined MX_KERNEL && MX_RUNTIME_OPT
  if (mx__opt.rcache == 1) {
    if (!mx__regcache_works()) {
      mx_printf("warning:regcache incompatible with malloc\n");
      mx__opt.rcache = 0;
    }
  }
  if (mx__opt.verbose) {
    MX_WARN(("rdmawin_cache_code=(compiled=%s/possible=%s/activated=%s)\n",
	     MX_RDMAWIN_CACHE ? "yes" : "no",
	     mx__regcache_works() ? "yes" : "no",
	     mx__opt.rcache ? "yes" : "no"
	     ));
  }
#endif

  if (mx__opt.rcache) {
    TAILQ_INIT(&ep->rdmawin_pinned);
    TAILQ_INIT(&ep->rdmawin_free);

    ep->rdmawin_items = mx_calloc(RDMAWIN_ENTRIES, sizeof(*ep->rdmawin_items));
    /* FIXME: proper error handling */
    mx_always_assert(ep->rdmawin_items);

    /* every entry starts out on the free list */
    while (slot < RDMAWIN_ENTRIES) {
      TAILQ_INSERT_HEAD(&ep->rdmawin_free, ep->rdmawin_items + slot, list);
      slot++;
    }
  }
}


/*
 * Tear down the rdma window cache of endpoint `ep`: unpin every window
 * still on the pinned list, reset that list and free the entry array.
 * Cache statistics are logged in verbose mode when endpoint statistics are
 * compiled in.  Does nothing when the cache is disabled.
 */
void mx_rdmawin_finalize(mx_endpoint_t ep)
{
  struct mx__rdmawin *win;

  if (mx__opt.rcache) {
    /* release whatever is still pinned */
    TAILQ_FOREACH(win, &ep->rdmawin_pinned, list) {
      mx_unpin(win->ep, win->rdma_id, win->send);
    }
    TAILQ_INIT(&ep->rdmawin_pinned);

    mx_free(ep->rdmawin_items);
#if MX__EP_STATS
    if (mx__opt.verbose) {
      MX_WARN(("hit:%u miss:%u  hit-kbytes:%llu miss-kbytes:%llu\n", 
	       ep->stats.rcache_hit, ep->stats.rcache_miss,
	       ep->stats.rcache_hit_kbytes, ep->stats.rcache_miss_kbytes));
    }
#endif /* MX__EP_STATS */
  }
}


/*
 * Compute the 16-bit one's-complement checksum (RFC 1071) of a buffer.
 *
 * vptr  start of the data; no alignment is required
 * len   number of bytes to sum
 *
 * The data is summed as native-endian 16-bit words, a trailing odd byte is
 * added as-is, carries are folded back into the low 16 bits and the
 * complement of the result is returned.  An empty buffer yields 0xffff.
 *
 * Words are assembled through byte accesses instead of dereferencing a
 * casted uint16_t pointer, so the function is well defined for buffers
 * that are not 2-byte aligned.
 */
uint16_t mx_checksum(void *vptr, unsigned len)
{
  const unsigned char *ptr = vptr;
  uint32_t sum = 0;
  /* RFC 1071 */
  while( len > 1 )  {
    /*  This is the inner loop */
    uint16_t word;
    unsigned char *raw = (unsigned char *)&word;
    /* same value as a native 16-bit load, without alignment constraints */
    raw[0] = ptr[0];
    raw[1] = ptr[1];
    sum += word;
    /* fold early so the accumulator can never overflow */
    sum = (sum & 0xffff) + (sum >> 16);
    len -= 2;
    ptr += 2;
  }
  
  /*  Add left-over byte, if any */
  if( len > 0 )
    sum += *ptr;
  
  /*  Fold 32-bit sum to 16 bits */
  while (sum>>16)
    sum = (sum & 0xffff) + (sum >> 16);
  return (uint16_t)~sum;
}
